From 2ed81fb668c4cc203909cb967d5acfd870740a75 Mon Sep 17 00:00:00 2001
From: Kato Twofold <kato@chunkbyte.com>
Date: Thu, 21 Oct 2021 02:14:34 +0300
Subject: [PATCH] + Some more work on rule34.xxx + rule34xxx Get all posts from
 a page + rule34xxx Crawl pages

---
 src/class/Scrapper.ts        |  88 +++++++++++++++----
 src/helper/requestManager.ts |  11 +++
 src/module/rule34xxx.ts      | 162 +++++++++++++++++++++++++++++++++--
 src/test.ts                  |  26 +++++-
 src/type/generic.ts          |  19 +++-
 5 files changed, 281 insertions(+), 25 deletions(-)
 create mode 100644 src/helper/requestManager.ts
diff --git a/src/class/Scrapper.ts b/src/class/Scrapper.ts
index 2db7286..b750069 100644
--- a/src/class/Scrapper.ts
+++ b/src/class/Scrapper.ts
@@ -1,4 +1,4 @@
-import {Post, Tag} from "../type/generic";
+import {Post, Tag, LogEntry, LogType} from "../type/generic";
 
 /**
  * The base class of the scrappers, any of the website scrappers must extend this class
@@ -13,29 +13,85 @@ export class Scrapper {
     /**
      * An array of all of the logs
      */
-    public logs: Array<any> = [];
+    public logs: Array<LogEntry> = [];
 
     /**
-     * The fully qualified domain of the website to scrap, for example "rule34.life"
+     * The origin (scheme + host, without a trailing /) of the website to scrap, for example "https://rule34.life"
      */
     public domain: string = ``;
 
     /**
-     * Get the details of a specific post
-     * @param url The URL to the post, this must be the actual page which contains the image, tags, etc...
+     * Display console logs
      */
-    public async getPostDetails( url: string ): Promise<Post | null> {
-        return null;
-    }
+    public verbose: boolean = false;
 
-    /**
-     * Get a list of posts from the mentioned page
-     * @param url 
-     * @returns 
-     */
-    public async getPostsFromPage( url: string ): Promise<Array<Post>> {
-        return [];
-    }
+    // #region Protected Functions
 
+        /**
+         * Check that a URL parses and that its origin is exactly `this.domain`.
+         * Every failure is appended to `this.logs`; parse errors are caught, not rethrown.
+         * @param url The absolute URL to validate
+         * @returns `true` when the origin matches `this.domain`, otherwise `false`
+         */
+        protected checkURLBase(url: string): boolean {
+            try {
+                // The URL constructor throws when the input cannot be parsed
+                const instance: URL = new URL(url);
+
+                // Both sides are strings, so compare strictly
+                if (instance.origin === this.domain) {
+                    return true;
+                }
+
+                this.logs.push({
+                    type: LogType.ERROR,
+                    msg: `Invalid URL provided`,
+                    // `provided` is additive: it records the rejected input next to the existing keys
+                    data: { url: this.domain, origin: instance.origin, provided: url },
+                    err: null,
+                    ts: new Date(),
+                });
+            } catch ( err ) {
+                this.logs.push({
+                    type: LogType.ERROR,
+                    msg:  `Failed to parse provided URL`,
+                    data: null,
+                    err:  (err as Error),
+                    ts:   new Date(),
+                });
+            }
+            return false;
+        }
+
+    // #endregion
+
+    // #region Public Functions
+        /**
+         * Get the details of a specific post. This base implementation is a stub that always resolves to `null`.
+         * @param url The URL to the post, this must be the actual page which contains the image, tags, etc...
+         */
+        public async getPostDetails( url: string ): Promise<Post | null> {
+            return null;
+        }
+
+        /**
+         * Get a list of posts from the mentioned page. This base implementation is a stub.
+         * @param url The URL of the page to read the posts from (ignored by this stub)
+         * @returns A promise of an array of strings; this stub always resolves to an empty array
+         */
+        public async getPostsFromPage( url: string ): Promise<Array<string>> {
+            return [];
+        }
+
+        /**
+         * Get a list of pages by starting to crawl from a specific page. This base stub resolves to an empty array.
+         * @param url The page the crawl starts from (ignored by this stub)
+         * @param pageCount The number of pages to crawl, 10 when omitted (ignored by this stub)
+         */
+        public async crawlPages( url: string, pageCount: number = 10 ): Promise<Array<string>> {
+            return [];
+        }
+
+    // #endregion
 
 }
\ No newline at end of file
diff --git a/src/helper/requestManager.ts b/src/helper/requestManager.ts
new file mode 100644
index 0000000..4e875d3
--- /dev/null
+++ b/src/helper/requestManager.ts
@@ -0,0 +1,11 @@
+import * as axiosPackage from 'axios';
+const axios = axiosPackage.default;
+
+/**
+ * Issue an HTTP GET for `url`, sending a fixed `Mozilla/5.0` User-Agent header.
+ * @returns The promise produced by `axios.get`, handed back without further processing
+ */
+export function getPageContents(url: string): Promise<axiosPackage.AxiosResponse<unknown, any>> {
+    const headers = { 'User-Agent': 'Mozilla/5.0' };
+    return axios.get(url, { headers });
+}
\ No newline at end of file
diff --git a/src/module/rule34xxx.ts b/src/module/rule34xxx.ts
index 7b0a2dd..7922101 100644
--- a/src/module/rule34xxx.ts
+++ b/src/module/rule34xxx.ts
@@ -1,17 +1,169 @@
-import {Post, Tag} from "../type/generic";
-import {Scrapper} from "../class/Scrapper";
+import {Post, Tag, LogEntry, LogType} from "../type/generic";
+import {Scrapper}                     from "../class/Scrapper";
+import {getPageContents}              from "../helper/requestManager";
+import * as cheerio from 'cheerio';
 
-class Rule34xxx extends Scrapper {
+export class Rule34xxx extends Scrapper {
+
+    constructor() {
+        // Pass the site origin (scheme + host, no trailing /) to Scrapper; presumably stored as `domain` - confirm
+        super("https://rule34.xxx");
+    }
 
     /**
      * Get the details of a specific post
      * @param url The URL to the post, this must be the actual page which contains the image, tags, etc...
      */
     public async getPostDetails( url: string ): Promise<Post | null> {
-        
-        
+
+        // Reject any URL that fails checkURLBase (which also records the reason in this.logs)
+        if ( !this.checkURLBase(url) ) {
+            throw new Error(`Invalid url provided`);
+        }
+
+        // TODO: fetch and parse the post page; until then every accepted URL resolves to null
+
 
         return null;
     }
 
+    /**
+     * Collect the link of every post thumbnail found on a listing page.
+     * @param url Absolute URL of the listing page; it has to pass `checkURLBase`
+     * @returns One entry per accepted `.thumb` link, built as `${this.domain}/${href}`
+     * @throws Error when the URL is rejected, the request fails or the status is not 2xx
+     */
+    public async getPostsFromPage( url: string ): Promise<Array<string>> {
+
+        // Check if the provided link is valid
+        if ( !this.checkURLBase(url) ) {
+            throw new Error(`Invalid url provided`);
+        }
+
+        // Assigned inside the try; the catch always rethrows, so it is set past that point
+        let pageContents: string;
+
+        try {
+            // Await the response directly instead of mixing `await` with a `.then` chain
+            const response = await getPageContents(url);
+
+            // Anything outside 2xx is logged and then handled like any other failure
+            if ( response.status < 200 || response.status > 299 ) {
+                this.logs.push({
+                    msg: `Invalid response code[${response.status}]`,
+                    type: LogType.ERROR,
+                    err:  null,
+                    data: null,
+                    ts: new Date()
+                });
+                throw new Error(`Invalid response code[${response.status}]`);
+            }
+
+            pageContents = (response.data as string);
+        } catch ( err ) {
+            // Record the failure (request error or bad status) and pass it on to the caller
+            this.logs.push({
+                msg: `[Error]::getPostsFromPage::`,
+                type: LogType.ERROR,
+                err: (err as Error),
+                data: null,
+                ts: new Date()
+            });
+            throw err;
+        }
+
+        // Process the page's posts with cheerio
+        const $ = cheerio.load(pageContents);
+
+        // An href shorter than the post-view prefix cannot be a link to a post
+        const minLength = `index.php?page=post&s=view&id=`.length;
+
+        // Define the post list
+        const postList: Array<string> = [];
+
+        // The arrow callback keeps `this` bound to the scrapper, so no `self` alias is needed
+        $(`.thumb`).each( (_index, thumb) => {
+            const href = $(thumb).find(`a`).attr(`href`);
+            // Thumbnails without a link, or with a link that is too short, are skipped
+            if ( href !== undefined && href.length >= minLength ) {
+                postList.push(`${this.domain}/${href}`);
+            }
+        });
+
+        return postList;
+    }
+
+    /**
+     * Walk a paginated listing by following its `a[alt="next"]` link, starting at `url`.
+     * @param url The first page to fetch; it has to pass `checkURLBase`
+     * @param pageCount Upper bound on the number of pages fetched
+     * @param batchSize Currently unused; kept so the signature stays compatible
+     * @returns The URLs of the pages that were fetched, in crawl order
+     * @throws Error when `url` is rejected, a request fails or a status is not 2xx
+     */
+    public async crawlPages( url: string, pageCount: number = 10, batchSize: number = 1 ): Promise<Array<string>> {
+
+        // Check if the provided link is valid
+        if ( !this.checkURLBase(url) ) {
+            throw new Error(`Invalid url provided`);
+        }
+
+        // A list of all of the found pages
+        const foundPages: Array<string> = [];
+        // The next url we are hitting
+        let nextPage: string = url;
+
+        // Pages are fetched one after another: each page holds the link to the next one
+        for ( let i = 0; i < pageCount; i++ ) {
+            if ( this.verbose ) {
+                console.log(`[${i+1}/${pageCount}]Crawling ${nextPage}`);
+            }
+
+            // Assigned inside the try; the catch always rethrows, so it is set past that point
+            let pageContents: string;
+
+            try {
+                const response = await getPageContents(nextPage);
+
+                if ( response.status < 200 || response.status > 299 ) {
+                    this.logs.push({
+                        msg: `Invalid response code[${response.status}]`,
+                        type: LogType.ERROR,
+                        err:  null,
+                        data: null,
+                        ts: new Date()
+                    });
+                    throw new Error(`Invalid response code[${response.status}]`);
+                }
+
+                pageContents = (response.data as string);
+            } catch ( err ) {
+                // Log under this method's own name (was mislabelled getPostsFromPage), then rethrow
+                this.logs.push({
+                    msg: `[Error]::crawlPages::`,
+                    type: LogType.ERROR,
+                    err: (err as Error),
+                    data: null,
+                    ts: new Date()
+                });
+                throw err;
+            }
+
+            // Process the page with cheerio
+            const $ = cheerio.load(pageContents);
+
+            // Add the current page we are on to the list
+            foundPages.push(nextPage);
+
+            // No "next" link, or one without an href, means there is nowhere left to go
+            const nextHref = $(`a[alt="next"]`).attr(`href`);
+            if ( nextHref === undefined ) {
+                break;
+            }
+            nextPage = `${this.domain}/${nextHref}`;
+        }
+
+        return foundPages;
+    }
+
 }
\ No newline at end of file
diff --git a/src/test.ts b/src/test.ts
index 61b3bfd..eae0a3b 100644
--- a/src/test.ts
+++ b/src/test.ts
@@ -1,2 +1,26 @@
 // This is the test file for the library, different tests are ran in here.
-console.log(`Working I guess`);
\ No newline at end of file
+import {Rule34xxx} from "./module/rule34xxx";
+import {Post} from "./type/generic";
+
+( async () => {
+    // Initialize the rule34 module
+    const r34: Rule34xxx = new Rule34xxx();
+    r34.verbose = true;
+
+    // Crawl up to 35 listing pages; `undefined` explicitly models a crawl that failed
+    let crawledPages: Array<string> | undefined;
+    try {
+        // Plain await inside try/catch instead of mixing `await` with `.then`/`.catch`
+        crawledPages = await r34.crawlPages(`https://rule34.xxx/index.php?page=post&s=list&tags=thelorope`, 35);
+    } catch ( err ) {
+        // Keep going so the collected logs below are still printed
+        console.log(err);
+    }
+
+    // Display results
+    console.log({
+        logs: r34.logs,
+        result: crawledPages
+    });
+})();
+
diff --git a/src/type/generic.ts b/src/type/generic.ts
index 0f980af..99ccab5 100644
--- a/src/type/generic.ts
+++ b/src/type/generic.ts
@@ -14,12 +14,12 @@ export interface Post {
     /**
      * URL to the original post link
      */
-    url?: string,
+    url: string,
 
     /**
      * A link to the full resolution image or video
      */
-    contentURL: string,
+    contentURL?: string,
 
     /**
      * The optional link for the source of the image
@@ -29,10 +29,23 @@ export interface Post {
     /**
      * A list of all of the tags the post has
      */
-    tags: Array<Tag>,
+    tags?: Array<Tag>,
 
     /**
      * The date of the post's creation
      */
     ts?: string,
+}
+
+export enum LogType {          // Tag stored in the `type` field of a LogEntry
+    ERROR = `error`,           // Used in this patch for rejected URLs and failed requests
+    INFO  = `info`,            // Not emitted anywhere in this patch
+}
+
+export interface LogEntry {    // Shape of the entries pushed onto Scrapper.logs
+    type:   LogType,           // Tag of the entry, see LogType
+    msg:    string,            // Human-readable description of what happened
+    data:   any,               // Extra context; callers in this patch pass an object or null
+    err:    null | Error,      // The caught error, or null when no error object is attached
+    ts:     Date,              // Creation time; callers in this patch pass `new Date()`
 }
\ No newline at end of file