+ Rule34XXX Crawler fully implemented
This commit is contained in:
parent
fe2372dc97
commit
50660517f5
|
@ -120,3 +120,6 @@ results.json
|
|||
|
||||
dist/*
|
||||
!dist/.keep
|
||||
|
||||
export/*
|
||||
!export/.keep
|
||||
|
|
|
@ -154,6 +154,9 @@ export class Rule34xxx extends Scrapper {
|
|||
|
||||
// Send out the request to grab the contents of the post
|
||||
try {
|
||||
if (this.verbose) {
|
||||
console.error(`Sniffing page...`);
|
||||
}
|
||||
// Send out the initial Axios request to fetch the data from the page
|
||||
await getPageContents(url)
|
||||
.then(request => {
|
||||
|
@ -170,8 +173,6 @@ export class Rule34xxx extends Scrapper {
|
|||
|
||||
pageContents = (request.data as string);
|
||||
})
|
||||
|
||||
|
||||
} catch (err) {
|
||||
// "Handle" the error so that it's in the above .catch
|
||||
this.logs.push({
|
||||
|
@ -200,10 +201,14 @@ export class Rule34xxx extends Scrapper {
|
|||
postList.push(`${self.domain}/${href}`);
|
||||
});
|
||||
|
||||
if (this.verbose) {
|
||||
console.error(`Found ${postList.length} posts`);
|
||||
}
|
||||
|
||||
return postList;
|
||||
}
|
||||
|
||||
public async crawlPages(url: string, pageCount: number = 10, batchSize: number = 1): Promise<Array<string>> {
|
||||
public async crawlPages(url: string, pageCount: number = 10, batchSize: number = 0): Promise<Array<string>> {
|
||||
|
||||
// Check if the provided link is valid
|
||||
if (!this.checkURLBase(url)) {
|
||||
|
|
65
src/test.ts
65
src/test.ts
|
@ -1,6 +1,7 @@
|
|||
// This is the test file for the library; different tests are run in here.
|
||||
import {Rule34xxx} from "./module/rule34xxx";
|
||||
import {Post} from "./type/generic";
|
||||
import * as fs from "fs/promises";
|
||||
|
||||
( async () => {
|
||||
// Initialize the rule34 module
|
||||
|
@ -8,19 +9,73 @@ import {Post} from "./type/generic";
|
|||
r34.verbose = true;
|
||||
|
||||
// Run the get post Details function
|
||||
let postDetails: any;
|
||||
await r34.getPostDetails(`https://rule34.xxx/index.php?page=post&s=view&id=5203781`)
|
||||
let pageList: Array<string>;
|
||||
await r34.crawlPages(`https://rule34.xxx/index.php?page=post&s=list&tags=sort%3Ascore%3Adesc+id%3A%3E4563063&pid=252`, 20)
|
||||
.then( postData => {
|
||||
postDetails = postData;
|
||||
pageList = postData;
|
||||
})
|
||||
.catch( err => {
|
||||
console.log(err);
|
||||
});
|
||||
|
||||
// Now grab all posts on all of those pages
|
||||
let postLinks: Array<string> = [];
|
||||
for ( let page of pageList ) {
|
||||
await r34.getPostsFromPage(page)
|
||||
.then( posts => {
|
||||
// Combine the two arrays
|
||||
postLinks = [...postLinks, ...posts];
|
||||
})
|
||||
.catch( err => {
|
||||
console.error(err);
|
||||
})
|
||||
}
|
||||
|
||||
/**
|
||||
* The definitive list of posts
|
||||
*/
|
||||
const postList: Array<Post> = [];
|
||||
|
||||
/**
|
||||
* The amount of posts to fetch per cycle
|
||||
*/
|
||||
const concurrency: number = 3;
|
||||
|
||||
for ( let i = 0; i < postLinks.length; i++ ) {
|
||||
const promiseList: Array<Promise<Post>> = [];
|
||||
for ( let j = 0; j < concurrency; j++ ) {
|
||||
// Add the link to the crawler's buffer
|
||||
promiseList.push(r34.getPostDetails(postLinks[i]));
|
||||
i++;
|
||||
}
|
||||
|
||||
// Wait for all promises to settle
|
||||
await Promise.allSettled(promiseList)
|
||||
.then( result => {
|
||||
// Append the results to the postList
|
||||
for ( let p of result ) {
|
||||
postList.push(((p as any).value as Post));
|
||||
}
|
||||
})
|
||||
.catch( err => {
|
||||
console.error(`err: `, err);
|
||||
})
|
||||
.finally(() => {
|
||||
console.log(`[${i}/${postLinks.length}][${(i/postLinks.length * 100).toFixed(2)}%] Scrapping...`);
|
||||
})
|
||||
}
|
||||
|
||||
console.log(`Done!`);
|
||||
|
||||
await fs.writeFile(`./export/r34xxx_pageList_example.json`, JSON.stringify(pageList, null, 4));
|
||||
await fs.writeFile(`./export/r34xxx_postLinks_example.json`, JSON.stringify(postLinks, null, 4));
|
||||
await fs.writeFile(`./export/r34xxx_postList_example.json`, JSON.stringify(postList, null, 4));
|
||||
|
||||
// Display results
|
||||
console.log({
|
||||
logs: r34.logs,
|
||||
result: postDetails
|
||||
pageList: pageList.length,
|
||||
postLinks: postLinks.length,
|
||||
postList: postList.length,
|
||||
});
|
||||
})();
|
||||
|
||||
|
|
Reference in New Issue