+ Some more work on rule34.xxx

+ rule34xxx Get all posts from a page
+ rule34xxx Crawl pages
Daniel Legt 2021-10-21 02:14:34 +03:00
parent 11224096f9
commit 2ed81fb668
5 changed files with 281 additions and 25 deletions

View File

@ -1,4 +1,4 @@
import {Post, Tag, LogEntry, LogType} from "../type/generic";

/**
 * The base class of the scrappers, any of the website scrappers must extend this class
@ -13,29 +13,85 @@ export class Scrapper {
    /**
     * An array of all of the logs
     */
    public logs: Array<LogEntry> = [];

    /**
     * The fully qualified domain base without a trailing / of the website to scrap, for example "https://rule34.life"
     */
    public domain: string = ``;

    /**
     * Display console logs
     */
    public verbose: boolean = false;

    // #region Protected Functions

    protected checkURLBase(url: string) {
        try {
            // Try and build a new URL class
            const instance: URL = new URL(url);

            // Check if the origin matches ours
            if (instance.origin == this.domain) {
                // Return success
                return true;
            } else {
                this.logs.push({
                    type: LogType.ERROR,
                    msg: `Invalid URL provided`,
                    data: {
                        url: this.domain,
                        origin: instance.origin
                    },
                    err: null,
                    ts: new Date(),
                });
            }
        } catch ( err ) {
            this.logs.push({
                type: LogType.ERROR,
                msg: `Failed to parse provided URL`,
                data: null,
                err: (err as Error),
                ts: new Date(),
            });
        }

        // Return a failure
        return false;
    }

    // #endregion

    // #region Public Functions

    /**
     * Get the details of a specific post
     * @param url The URL to the post, this must be the actual page which contains the image, tags, etc...
     */
    public async getPostDetails( url: string ): Promise<Post | null> {
        return null;
    }

    /**
     * Get a list of posts from the mentioned page
     * @param url
     * @returns
     */
    public async getPostsFromPage( url: string ): Promise<Array<string>> {
        return [];
    }

    /**
     * Get a list of pages by starting to crawl from a specific page.
     * @param url The starting page, this will crawl as many pages as you mention
     * @param pageCount The number of pages to crawl
     */
    public async crawlPages( url: string, pageCount: number = 10 ): Promise<Array<string>> {
        return [];
    }

    // #endregion
}
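
A note on the new checkURLBase helper: it compares URL.origin against this.domain, so the configured domain must include the protocol and omit the trailing slash. A small illustrative sketch follows; the DemoScrapper subclass is hypothetical, and it assumes the base constructor accepts and stores the domain, as the Rule34xxx subclass further down suggests.

import {Scrapper} from "./class/Scrapper";

// Hypothetical subclass used only to exercise the protected origin check
class DemoScrapper extends Scrapper {
    constructor() {
        // Assumption: the base constructor stores the given domain in this.domain
        super(`https://rule34.life`);
    }
    public check(url: string): boolean {
        return this.checkURLBase(url);
    }
}

const demo = new DemoScrapper();
console.log(demo.check(`https://rule34.life/index.php?page=post&s=list`)); // expected: true
console.log(demo.check(`https://example.com/whatever`));                   // expected: false, and a LogEntry is pushed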

View File

@ -0,0 +1,11 @@
import * as axiosPackage from 'axios';
const axios = axiosPackage.default;
export function getPageContents(url: string): Promise<axiosPackage.AxiosResponse<unknown, any>> {
    // Return the axios function's promise
    return axios.get(url, {
        headers: {
            'User-Agent': 'Mozilla/5.0',
        }
    });
}
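
For reference, a minimal usage sketch of the getPageContents helper (illustrative only; the import path and the example URL are assumptions, not part of the commit):

import {getPageContents} from "./helper/requestManager";

(async () => {
    // Fetch one listing page and inspect the response; the URL is only an example
    const response = await getPageContents(`https://rule34.xxx/index.php?page=post&s=list`);
    console.log(response.status);
    console.log((response.data as string).length);
})();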

View File

@ -1,17 +1,169 @@
import {Post, Tag, LogEntry, LogType} from "../type/generic";
import {Scrapper} from "../class/Scrapper";
import {getPageContents} from "../helper/requestManager";
import * as cheerio from 'cheerio';

export class Rule34xxx extends Scrapper {

    constructor() {
        // Set the domain base of the current Scrapper as "rule34.xxx"
        super("https://rule34.xxx");
    }

    /**
     * Get the details of a specific post
     * @param url The URL to the post, this must be the actual page which contains the image, tags, etc...
     */
    public async getPostDetails( url: string ): Promise<Post | null> {
        // Check if the provided link is valid
        if ( !this.checkURLBase(url) ) {
            throw new Error(`Invalid url provided`);
        }

        // Send out the request to grab the contents of the post

        return null;
    }

    /**
     * Get a list of posts from the mentioned page
     * @param url
     * @returns
     */
    public async getPostsFromPage( url: string ): Promise<Array<string>> {
        // Check if the provided link is valid
        if ( !this.checkURLBase(url) ) {
            throw new Error(`Invalid url provided`);
        }

        // Initialize the page contents here
        let pageContents: string = null;

        // Send out the request to grab the contents of the post
        try {
            // Send out the initial Axios request to fetch the data from the page
            await getPageContents(url)
                .then( request => {
                    if ( request.status < 200 || request.status > 299 ) {
                        this.logs.push({
                            msg: `Invalid response code[${request.status}]`,
                            type: LogType.ERROR,
                            err: null,
                            data: null,
                            ts: new Date()
                        });
                        throw new Error(`Invalid response code[${request.status}]`);
                    }

                    pageContents = (request.data as string);
                })
        } catch ( err ) {
            // "Handle" the error so that it's in the above .catch
            this.logs.push({
                msg: `[Error]::getPostsFromPage::`,
                type: LogType.ERROR,
                err: (err as Error),
                data: null,
                ts: new Date()
            });
            throw err;
        }

        // Process the page's posts with cheerio
        const $ = cheerio.load((pageContents as string));

        // Define the post List
        const postList: Array<string> = [];

        // Workaround I guess
        let self = this;

        // Go through all of the posts
        $(`.thumb`).each( function() {
            const href = $(this).find(`a`).attr(`href`);
            if ( `${href}`.length >= `index.php?page=post&s=view&id=`.length )
                postList.push(`${self.domain}/${href}`);
        });

        return postList;
    }

    public async crawlPages( url: string, pageCount: number = 10, batchSize: number = 1 ): Promise<Array<string>> {
        // Check if the provided link is valid
        if ( !this.checkURLBase(url) ) {
            throw new Error(`Invalid url provided`);
        }

        // A list of all of the found pages
        let foundPages = new Array<string>();

        // The next url we are hitting
        let nextPage: string = url;

        // Go through as many pages as requested
        for ( let i = 0; i < pageCount; i++ ) {
            if ( this.verbose ) {
                console.log(`[${i+1}/${pageCount}]Crawling ${nextPage}`);
            }

            // Initialize the page contents here
            let pageContents: string = null;

            // Try and find the button to the next page
            try {
                // Send out the initial Axios request to fetch the data from the page
                await getPageContents(nextPage)
                    .then( request => {
                        if ( request.status < 200 || request.status > 299 ) {
                            this.logs.push({
                                msg: `Invalid response code[${request.status}]`,
                                type: LogType.ERROR,
                                err: null,
                                data: null,
                                ts: new Date()
                            });
                            throw new Error(`Invalid response code[${request.status}]`);
                        }

                        pageContents = (request.data as string);
                    })
            } catch ( err ) {
                // "Handle" the error so that it's in the above .catch
                this.logs.push({
                    msg: `[Error]::getPostsFromPage::`,
                    type: LogType.ERROR,
                    err: (err as Error),
                    data: null,
                    ts: new Date()
                });
                throw err;
            }

            // Process the page's posts with cheerio
            const $ = cheerio.load((pageContents as string));

            // Add the current page we are on to the list
            foundPages.push(nextPage);

            const nextPageButton = $(`a[alt="next"]`);
            if ( nextPageButton.length > 0 ) {
                nextPage = `${this.domain}/` + nextPageButton.attr(`href`);
            } else {
                // Since we didn't find the proper button, skip this page.
                break;
            }
        }

        // Return the found pages
        return foundPages;
    }
}
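
For context, a minimal sketch of driving the new getPostsFromPage method (illustrative only; the tag in the listing URL is a placeholder, not part of the diff):

import {Rule34xxx} from "./module/rule34xxx";

(async () => {
    const r34: Rule34xxx = new Rule34xxx();
    r34.verbose = true;

    // Collect the post links from a single listing page (placeholder tag)
    const posts: Array<string> = await r34.getPostsFromPage(`https://rule34.xxx/index.php?page=post&s=list&tags=example`);
    console.log(posts.length, r34.logs);
})();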

View File

@ -1,2 +1,26 @@
// This is the test file for the library, different tests are ran in here.
import {Rule34xxx} from "./module/rule34xxx";
import {Post} from "./type/generic";

( async () => {
    // Initialize the rule34 module
    const r34: Rule34xxx = new Rule34xxx();
    r34.verbose = true;

    // Run the get post Details function
    let postDetails: Array<string>;
    await r34.crawlPages(`https://rule34.xxx/index.php?page=post&s=list&tags=thelorope`, 35)
        .then( postData => {
            postDetails = postData;
        })
        .catch( err => {
            console.log(err);
        });

    // Display results
    console.log({
        logs: r34.logs,
        result: postDetails
    });
})();

View File

@ -14,12 +14,12 @@ export interface Post {
    /**
     * URL to the original post link
     */
    url: string,

    /**
     * A link to the full resolution image or video
     */
    contentURL?: string,

    /**
     * The optional link for the source of the image
@ -29,10 +29,23 @@ export interface Post {
    /**
     * A list of all of the tags the post has
     */
    tags?: Array<Tag>,

    /**
     * The date of the post's creation
     */
    ts?: string,
}

export enum LogType {
    ERROR = `error`,
    INFO = `info`,
}

export interface LogEntry {
    type: LogType,
    msg: string,
    data: any,
    err: null | Error,
    ts: Date,
}
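
For illustration only, this is the shape of the entries the scrappers push into Scrapper.logs using the new LogEntry interface and LogType enum (the message and data here are made up):

import {LogEntry, LogType} from "./type/generic";

// Example log entry matching what checkURLBase and getPostsFromPage record
const entry: LogEntry = {
    type: LogType.INFO,
    msg: `Crawled a page`,
    data: { url: `https://rule34.xxx/index.php?page=post&s=list` },
    err: null,
    ts: new Date(),
};
console.log(entry);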