diff --git a/.gitignore b/.gitignore
index 1622c46..5769530 100644
--- a/.gitignore
+++ b/.gitignore
@@ -119,4 +119,7 @@ out
 
 results.json
 dist/*
-!dist/.keep
\ No newline at end of file
+!dist/.keep
+
+export/*
+!export/.keep
diff --git a/export/.keep b/export/.keep
new file mode 100644
index 0000000..e69de29
diff --git a/src/module/rule34xxx.ts b/src/module/rule34xxx.ts
index be0d6d6..98a6b38 100644
--- a/src/module/rule34xxx.ts
+++ b/src/module/rule34xxx.ts
@@ -154,6 +154,9 @@ export class Rule34xxx extends Scrapper {
 
 		// Send out the request to grab the contents of the post
 		try {
+			if (this.verbose) {
+				console.error(`Sniffing page...`);
+			}
 			// Send out the initial Axios request to fetch the data from the page
 			await getPageContents(url)
 				.then(request => {
@@ -170,8 +173,6 @@ export class Rule34xxx extends Scrapper {
 
 					pageContents = (request.data as string);
 				})
-
-
 		} catch (err) {
 			// "Handle" the error so that it's in the above .catch
 			this.logs.push({
@@ -200,10 +201,14 @@
 			postList.push(`${self.domain}/${href}`);
 		});
 
+		if (this.verbose) {
+			console.error(`Found ${postList.length} posts`);
+		}
+
 		return postList;
 	}
 
-	public async crawlPages(url: string, pageCount: number = 10, batchSize: number = 1): Promise<Array<string>> {
+	public async crawlPages(url: string, pageCount: number = 10, batchSize: number = 0): Promise<Array<string>> {
 
 		// Check if the provided link is valid
 		if (!this.checkURLBase(url)) {
diff --git a/src/test.ts b/src/test.ts
index 8ba10df..f8e176f 100644
--- a/src/test.ts
+++ b/src/test.ts
@@ -1,6 +1,7 @@
 // This is the test file for the library, different tests are ran in here.
 import {Rule34xxx} from "./module/rule34xxx";
 import {Post} from "./type/generic";
+import * as fs from "fs/promises";
 
 ( async () => {
 	// Initialize the rule34 module
@@ -8,19 +9,73 @@ import {Post} from "./type/generic";
 	r34.verbose = true;
 
-	// Run the get post Details function
-	let postDetails: any;
-	await r34.getPostDetails(`https://rule34.xxx/index.php?page=post&s=view&id=5203781`)
+	// Crawl the listing pages first to collect the page links
+	let pageList: Array<string> = [];
+	await r34.crawlPages(`https://rule34.xxx/index.php?page=post&s=list&tags=sort%3Ascore%3Adesc+id%3A%3E4563063&pid=252`, 20)
 		.then( postData => {
-			postDetails = postData;
+			pageList = postData;
 		})
 		.catch( err => {
 			console.log(err);
 		});
 
+	// Now grab all posts on all of those pages
+	let postLinks: Array<string> = [];
+	for ( let page of pageList ) {
+		await r34.getPostsFromPage(page)
+			.then( posts => {
+				// Combine the two arrays
+				postLinks = [...postLinks, ...posts];
+			})
+			.catch( err => {
+				console.error(err);
+			})
+	}
+
+	/**
+	 * The definitive list of posts
+	 */
+	const postList: Array<Post> = [];
+
+	/**
+	 * The amount of posts to fetch per cycle
+	 */
+	const concurrency: number = 3;
+
+	for ( let i = 0; i < postLinks.length; ) {
+		const promiseList: Array<Promise<Post>> = [];
+		for ( let j = 0; j < concurrency && i < postLinks.length; j++ ) {
+			// Queue the next link; `i` only advances here so no link is skipped
+			promiseList.push(r34.getPostDetails(postLinks[i]));
+			i++;
+		}
+
+		// Wait for all promises to settle
+		await Promise.allSettled(promiseList)
+			.then( result => {
+				// Append only the successfully fetched posts to the postList
+				for ( let p of result ) {
+					if (p.status === "fulfilled") postList.push(p.value);
+				}
+			})
+			.catch( err => {
+				console.error(`err: `, err);
+			})
+			.finally(() => {
+				console.log(`[${i}/${postLinks.length}][${(i/postLinks.length * 100).toFixed(2)}%] Scraping...`);
+			})
+	}
+
+	console.log(`Done!`);
+
+	await fs.writeFile(`./export/r34xxx_pageList_example.json`, JSON.stringify(pageList, null, 4));
+	await fs.writeFile(`./export/r34xxx_postLinks_example.json`, JSON.stringify(postLinks, null, 4));
+	await fs.writeFile(`./export/r34xxx_postList_example.json`, JSON.stringify(postList, null, 4));
+
 	// Display results
 	console.log({
-		logs: r34.logs,
-		result: postDetails
+		pageList: pageList.length,
+		postLinks: postLinks.length,
+		postList: postList.length,
 	});
 })();
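
Note: the batching loop added to src/test.ts above is the heart of this change. It walks postLinks in fixed-size windows and relies on Promise.allSettled to absorb individual failed fetches without aborting the batch. Below is a minimal standalone sketch of that same pattern; fetchInBatches and fetchOne are hypothetical names for illustration, not part of this diff.

async function fetchInBatches<T>(
	links: Array<string>,
	fetchOne: (link: string) => Promise<T>,
	concurrency: number = 3,
): Promise<Array<T>> {
	const results: Array<T> = [];
	for (let i = 0; i < links.length; i += concurrency) {
		// Fire off up to `concurrency` requests at once
		const batch = links.slice(i, i + concurrency).map(link => fetchOne(link));
		// allSettled never rejects, so one failed fetch cannot abort the batch
		for (const settled of await Promise.allSettled(batch)) {
			if (settled.status === "fulfilled") {
				results.push(settled.value);
			}
		}
	}
	return results;
}

With the library's own method, the test above would reduce to roughly fetchInBatches(postLinks, link => r34.getPostDetails(link), 3).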