From 2ed81fb668c4cc203909cb967d5acfd870740a75 Mon Sep 17 00:00:00 2001 From: Kato Twofold Date: Thu, 21 Oct 2021 02:14:34 +0300 Subject: [PATCH] + Some more work on rule34.xxx + rule34xxx Get all posts from a page + rule34xxx Crawl pages --- src/class/Scrapper.ts | 88 +++++++++++++++---- src/helper/requestManager.ts | 11 +++ src/module/rule34xxx.ts | 162 +++++++++++++++++++++++++++++++++-- src/test.ts | 26 +++++- src/type/generic.ts | 19 +++- 5 files changed, 281 insertions(+), 25 deletions(-) create mode 100644 src/helper/requestManager.ts diff --git a/src/class/Scrapper.ts b/src/class/Scrapper.ts index 2db7286..b750069 100644 --- a/src/class/Scrapper.ts +++ b/src/class/Scrapper.ts @@ -1,4 +1,4 @@ -import {Post, Tag} from "../type/generic"; +import {Post, Tag, LogEntry, LogType} from "../type/generic"; /** * The base class of the scrappers, any of the website scrappers must extend this class @@ -13,29 +13,85 @@ export class Scrapper { /** * An array of all of the logs */ - public logs: Array = []; + public logs: Array = []; /** - * The fully qualified domain of the website to scrap, for example "rule34.life" + * The fully qualified domain base without a trailing / of the website to scrap, for example "https://rule34.life" */ public domain: string = ``; /** - * Get the details of a specific post - * @param url The URL to the post, this must be the actual page which contains the image, tags, etc... + * Display console logs */ - public async getPostDetails( url: string ): Promise { - return null; - } + public verbose: boolean = false; - /** - * Get a list of posts from the mentioned page - * @param url - * @returns - */ - public async getPostsFromPage( url: string ): Promise> { - return []; - } + // #region Protected Functions + protected checkURLBase(url: string) { + try { + // Try and build a new URL class + const instance: URL = new URL(url); + + // Check if the origin matches ours + if (instance.origin == this.domain) { + // Return success + return true; + } else { + this.logs.push({ + type: LogType.ERROR, + msg: `Invalid URL provided`, + data: { + url: this.domain, + origin: instance.origin + }, + err: null, + ts: new Date(), + }); + } + + } catch ( err ) { + this.logs.push({ + type: LogType.ERROR, + msg: `Failed to parse provided URL`, + data: null, + err: (err as Error), + ts: new Date(), + }); + } + + // Return a failure + return false; + } + + // #endregion + + // #region Public Functions + /** + * Get the details of a specific post + * @param url The URL to the post, this must be the actual page which contains the image, tags, etc... + */ + public async getPostDetails( url: string ): Promise { + return null; + } + + /** + * Get a list of posts from the mentioned page + * @param url + * @returns + */ + public async getPostsFromPage( url: string ): Promise> { + return []; + } + + /** + * Get a list of pages by starting to crawl from a specific page. + * @param url The starting page, this will crawl as many pages as you mention + * @param pageCount The number of pages to crawl + */ + public async crawlPages( url: string, pageCount: number = 10 ): Promise> { + return []; + } + + // #endregion } \ No newline at end of file diff --git a/src/helper/requestManager.ts b/src/helper/requestManager.ts new file mode 100644 index 0000000..4e875d3 --- /dev/null +++ b/src/helper/requestManager.ts @@ -0,0 +1,11 @@ +import * as axiosPackage from 'axios'; +const axios = axiosPackage.default; + +export function getPageContents(url: string): Promise> { + // Return the axios function's promise + return axios.get(url, { + headers: { + 'User-Agent': 'Mozilla/5.0', + } + }); +} \ No newline at end of file diff --git a/src/module/rule34xxx.ts b/src/module/rule34xxx.ts index 7b0a2dd..7922101 100644 --- a/src/module/rule34xxx.ts +++ b/src/module/rule34xxx.ts @@ -1,17 +1,169 @@ -import {Post, Tag} from "../type/generic"; -import {Scrapper} from "../class/Scrapper"; +import {Post, Tag, LogEntry, LogType} from "../type/generic"; +import {Scrapper} from "../class/Scrapper"; +import {getPageContents} from "../helper/requestManager"; +import * as cheerio from 'cheerio'; -class Rule34xxx extends Scrapper { +export class Rule34xxx extends Scrapper { + + constructor() { + // Set the domain base of the current Scrapper as "rule34.xxx" + super("https://rule34.xxx"); + } /** * Get the details of a specific post * @param url The URL to the post, this must be the actual page which contains the image, tags, etc... */ public async getPostDetails( url: string ): Promise { - - + + // Check if the provided link is valid + if ( !this.checkURLBase(url) ) { + throw new Error(`Invalid url provided`); + } + + // Send out the request to grab the contents of the post + return null; } + /** + * Get a list of posts from the mentioned page + * @param url + * @returns + */ + public async getPostsFromPage( url: string ): Promise> { + + // Check if the provided link is valid + if ( !this.checkURLBase(url) ) { + throw new Error(`Invalid url provided`); + } + + // Initialize the page contents here + let pageContents: string = null; + + // Send out the request to grab the contents of the post + try { + // Send out the initial Axios request to fetch the data from the page + await getPageContents(url) + .then( request => { + if ( request.status < 200 || request.status > 299 ) { + this.logs.push({ + msg: `Invalid response code[${request.status}]`, + type: LogType.ERROR, + err: null, + data: null, + ts: new Date() + }); + throw new Error(`Invalid response code[${request.status}]`); + } + + pageContents = (request.data as string); + }) + + + } catch ( err ) { + // "Handle" the error so that it's in the above .catch + this.logs.push({ + msg: `[Error]::getPostsFromPage::`, + type: LogType.ERROR, + err: (err as Error), + data: null, + ts: new Date() + }); + throw err; + } + + // Process the page's posts with cheerio + const $ = cheerio.load((pageContents as string)); + + // Define the post List + const postList: Array = []; + + // Workaround I guess + let self = this; + + // Go through all of the posts + $(`.thumb`).each( function() { + const href = $(this).find(`a`).attr(`href`); + if ( `${href}`.length >= `index.php?page=post&s=view&id=`.length ) + postList.push(`${self.domain}/${href}`); + }); + + return postList; + } + + public async crawlPages( url: string, pageCount: number = 10, batchSize: number = 1 ): Promise> { + + // Check if the provided link is valid + if ( !this.checkURLBase(url) ) { + throw new Error(`Invalid url provided`); + } + + // A list of all of the found pages + let foundPages = new Array(); + // The next url we are hitting + let nextPage: string = url; + + // Go through as many pages as requested + for ( let i = 0; i < pageCount; i++ ) { + + if ( this.verbose ) { + console.log(`[${i+1}/${pageCount}]Crawling ${nextPage}`); + } + + // Initialize the page contents here + let pageContents: string = null; + + // Try and find the button to the next page + try { + // Send out the initial Axios request to fetch the data from the page + await getPageContents(nextPage) + .then( request => { + if ( request.status < 200 || request.status > 299 ) { + this.logs.push({ + msg: `Invalid response code[${request.status}]`, + type: LogType.ERROR, + err: null, + data: null, + ts: new Date() + }); + throw new Error(`Invalid response code[${request.status}]`); + } + + pageContents = (request.data as string); + }) + + + } catch ( err ) { + // "Handle" the error so that it's in the above .catch + this.logs.push({ + msg: `[Error]::getPostsFromPage::`, + type: LogType.ERROR, + err: (err as Error), + data: null, + ts: new Date() + }); + throw err; + } + + // Process the page's posts with cheerio + const $ = cheerio.load((pageContents as string)); + + // Add the current page we are on to the list + foundPages.push(nextPage); + + const nextPageButton = $(`a[alt="next"]`); + if ( nextPageButton.length > 0 ) { + nextPage = `${this.domain}/` + nextPageButton.attr(`href`); + } else { + // Since we didn't find the proper button, skip this page. + break; + } + } + + // Return the found pages + return foundPages; + } + } \ No newline at end of file diff --git a/src/test.ts b/src/test.ts index 61b3bfd..eae0a3b 100644 --- a/src/test.ts +++ b/src/test.ts @@ -1,2 +1,26 @@ // This is the test file for the library, different tests are ran in here. -console.log(`Working I guess`); \ No newline at end of file +import {Rule34xxx} from "./module/rule34xxx"; +import {Post} from "./type/generic"; + +( async () => { + // Initialize the rule34 module + const r34: Rule34xxx = new Rule34xxx(); + r34.verbose = true; + + // Run the get post Details function + let postDetails: Array; + await r34.crawlPages(`https://rule34.xxx/index.php?page=post&s=list&tags=thelorope`, 35) + .then( postData => { + postDetails = postData; + }) + .catch( err => { + console.log(err); + }); + + // Display results + console.log({ + logs: r34.logs, + result: postDetails + }); +})(); + diff --git a/src/type/generic.ts b/src/type/generic.ts index 0f980af..99ccab5 100644 --- a/src/type/generic.ts +++ b/src/type/generic.ts @@ -14,12 +14,12 @@ export interface Post { /** * URL to the original post link */ - url?: string, + url: string, /** * A link to the full resolution image or video */ - contentURL: string, + contentURL?: string, /** * The optional link for the source of the image @@ -29,10 +29,23 @@ export interface Post { /** * A list of all of the tags the post has */ - tags: Array, + tags?: Array, /** * The date of the post's creation */ ts?: string, +} + +export enum LogType { + ERROR = `error`, + INFO = `info`, +} + +export interface LogEntry { + type: LogType, + msg: string, + data: any, + err: null | Error, + ts: Date, } \ No newline at end of file