+ Rule34XXX Crawler fully implemented

2021-10-22 00:26:18 +03:00 · 2021-10-22 00:26:18 +03:00 · 50660517f5
parent fe2372dc97
commit 50660517f5
4 changed files with 72 additions and 9 deletions
--- a/.gitignore
+++ b/.gitignore
@ -120,3 +120,6 @@ results.json

 dist/*
 !dist/.keep
+
+export/*
+!export/.keep
--- a/export/.keep
+++ b/export/.keep
--- a/src/module/rule34xxx.ts
+++ b/src/module/rule34xxx.ts
@ -154,6 +154,9 @@ export class Rule34xxx extends Scrapper {

        // Send out the request to grab the contents of the post
        try {
+            if (this.verbose) {
+                console.error(`Sniffing page...`);
+            }
            // Send out the initial Axios request to fetch the data from the page
            await getPageContents(url)
                .then(request => {
@ -170,8 +173,6 @@ export class Rule34xxx extends Scrapper {

                    pageContents = (request.data as string);
                })
-
-
        } catch (err) {
            // "Handle" the error so that it's in the above .catch
            this.logs.push({
@ -200,10 +201,14 @@ export class Rule34xxx extends Scrapper {
                postList.push(`${self.domain}/${href}`);
        });

+        if (this.verbose) {
+            console.error(`Found ${postList.length} posts`);
+        }
+
        return postList;
    }

-    public async crawlPages(url: string, pageCount: number = 10, batchSize: number = 1): Promise<Array<string>> {
+    public async crawlPages(url: string, pageCount: number = 10, batchSize: number = 0): Promise<Array<string>> {

        // Check if the provided link is valid
        if (!this.checkURLBase(url)) {
--- a/src/test.ts
+++ b/src/test.ts
@ -1,6 +1,7 @@
 // This is the test file for the library, different tests are ran in here.
 import {Rule34xxx} from "./module/rule34xxx";
 import {Post} from "./type/generic";
+import * as fs from "fs/promises";

 ( async () => {
    // Initialize the rule34 module
@ -8,19 +9,73 @@ import {Post} from "./type/generic";
    r34.verbose = true;
    
    // Run the get post Details function
-    let postDetails: any;
-    await r34.getPostDetails(`https://rule34.xxx/index.php?page=post&s=view&id=5203781`)
+    let pageList: Array<string>;
+    await r34.crawlPages(`https://rule34.xxx/index.php?page=post&s=list&tags=sort%3Ascore%3Adesc+id%3A%3E4563063&pid=252`, 20)
    .then(  postData => {
-        postDetails = postData;
+        pageList = postData;
    })
    .catch( err => { 
        console.log(err); 
    });

+    // Now grab all posts on all of those pages
+    let postLinks: Array<string> = [];
+    for ( let page of pageList ) {
+        await r34.getPostsFromPage(page)
+        .then( posts => {
+            // Combine the two arrays
+            postLinks = [...postLinks, ...posts];
+        })
+        .catch( err => {
+            console.error(err);
+        })
+    }
+
+    /**
+     * The definitive list of posts
+     */
+    const postList: Array<Post> = [];
+    
+    /**
+     * The number of posts to fetch concurrently per cycle
+     */
+    const concurrency: number = 3;
+
+    for ( let i = 0; i < postLinks.length; i++ ) {
+        const promiseList: Array<Promise<Post>> = [];
+        for ( let j = 0; j < concurrency; j++ ) {
+            // Add the link to the crawler's buffer
+            promiseList.push(r34.getPostDetails(postLinks[i]));
+            i++;
+        }
+
+        // Wait for all promises to settle
+        await Promise.allSettled(promiseList)
+        .then( result => {
+            // Append the results to the postList
+            for ( let p of result ) {
+                postList.push(((p as any).value as Post));
+            }
+        })
+        .catch( err => {
+            console.error(`err: `, err);
+        })
+        .finally(() => {
+            console.log(`[${i}/${postLinks.length}][${(i/postLinks.length * 100).toFixed(2)}%] Scrapping...`);
+        })
+    }
+
+    console.log(`Done!`);
+
+    await fs.writeFile(`./export/r34xxx_pageList_example.json`, JSON.stringify(pageList, null, 4));
+    await fs.writeFile(`./export/r34xxx_postLinks_example.json`, JSON.stringify(postLinks, null, 4));
+    await fs.writeFile(`./export/r34xxx_postList_example.json`, JSON.stringify(postList, null, 4));
+
    // Display results
    console.log({
-        logs: r34.logs, 
-        result: postDetails
+        pageList:  pageList.length,
+        postLinks: postLinks.length,
+        postList:  postList.length,
    });
 })();