+ Some more work on rule34.xxx
+ rule34xxx Get all posts from a page + rule34xxx Crawl pages
This commit is contained in:
		
							parent
							
								
									11224096f9
								
							
						
					
					
						commit
						2ed81fb668
					
				| 
						 | 
				
			
			@ -1,4 +1,4 @@
 | 
			
		|||
import {Post, Tag} from "../type/generic";
 | 
			
		||||
import {Post, Tag, LogEntry, LogType} from "../type/generic";
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * The base class of the scrappers, any of the website scrappers must extend this class
 | 
			
		||||
| 
						 | 
				
			
			@ -13,29 +13,85 @@ export class Scrapper {
 | 
			
		|||
    /**
 | 
			
		||||
     * An array of all of the logs
 | 
			
		||||
     */
 | 
			
		||||
    public logs: Array<any> = [];
 | 
			
		||||
    public logs: Array<LogEntry> = [];
 | 
			
		||||
 | 
			
		||||
    /**
 | 
			
		||||
     * The fully qualified domain of the website to scrap, for example "rule34.life"
 | 
			
		||||
     * The base origin (scheme and host, without a trailing /) of the website to scrape, for example "https://rule34.life"
 | 
			
		||||
     */
 | 
			
		||||
    public domain: string = ``;
 | 
			
		||||
 | 
			
		||||
    /**
 | 
			
		||||
     * Get the details of a specific post
 | 
			
		||||
     * @param url The URL to the post, this must be the actual page which contains the image, tags, etc...
 | 
			
		||||
     * Display console logs
 | 
			
		||||
     */
 | 
			
		||||
    public async getPostDetails( url: string ): Promise<Post | null> {
 | 
			
		||||
        return null;
 | 
			
		||||
    }
 | 
			
		||||
    public verbose: boolean = false;
 | 
			
		||||
 | 
			
		||||
    /**
 | 
			
		||||
     * Get a list of posts from the mentioned page
 | 
			
		||||
     * @param url 
 | 
			
		||||
     * @returns 
 | 
			
		||||
     */
 | 
			
		||||
    public async getPostsFromPage( url: string ): Promise<Array<Post>> {
 | 
			
		||||
        return [];
 | 
			
		||||
    }
 | 
			
		||||
    // #region Protected Functions
 | 
			
		||||
 | 
			
		||||
        /**
         * Check that a URL belongs to the website this scrapper targets.
         *
         * The URL is parsed with the global `URL` class and its `origin` is compared
         * against `this.domain`. Every failure is recorded in `this.logs`; this method
         * reports the outcome through its return value and does not throw.
         *
         * @param url The URL to validate
         * @returns `true` when the URL parses and its origin equals `this.domain`, otherwise `false`
         */
        protected checkURLBase(url: string): boolean {
            // `new URL` throws on malformed input, so parsing is isolated from the comparison
            let parsed: URL;
            try {
                parsed = new URL(url);
            } catch ( err ) {
                this.logs.push({
                    type: LogType.ERROR,
                    msg:  `Failed to parse provided URL`,
                    // Keep the offending input so the log entry is actionable
                    data: { url },
                    err:  err instanceof Error ? err : new Error(String(err)),
                    ts:   new Date(),
                });
                return false;
            }

            // Both sides are strings, so a strict comparison is what is intended here
            if ( parsed.origin === this.domain ) {
                return true;
            }

            this.logs.push({
                type: LogType.ERROR,
                msg:  `Invalid URL provided`,
                data: {
                    // The rejected URL, the origin it resolved to, and the origin we expected
                    url,
                    origin: parsed.origin,
                    domain: this.domain,
                },
                err:  null,
                ts:   new Date(),
            });
            return false;
        }
 | 
			
		||||
 | 
			
		||||
    // #endregion
 | 
			
		||||
 | 
			
		||||
    // #region Public Functions
 | 
			
		||||
        /**
         * Retrieve the details of a single post.
         *
         * This base implementation is a placeholder that always resolves to `null`;
         * website-specific scrappers are expected to provide their own version.
         *
         * @param url The URL of the post page itself (the page holding the image, tags, etc...)
         * @returns The post's details, or `null` when none are available
         */
        public async getPostDetails( url: string ): Promise<Post | null> {
            // Nothing to fetch at the base level
            return null;
        }
 | 
			
		||||
 | 
			
		||||
        /**
         * Collect the posts listed on a page.
         *
         * This base implementation is a placeholder that always resolves to an empty
         * array; website-specific scrappers are expected to provide their own version.
         *
         * @param url The URL of the listing page
         * @returns The entries found on the page, as strings
         */
        public async getPostsFromPage( url: string ): Promise<Array<string>> {
            // Nothing to list at the base level
            return [];
        }
 | 
			
		||||
 | 
			
		||||
        /**
         * Collect a list of pages by crawling forward from a starting page.
         *
         * This base implementation is a placeholder that always resolves to an empty
         * array; website-specific scrappers are expected to provide their own version.
         *
         * @param url The page to start crawling from
         * @param pageCount The number of pages to crawl (defaults to 10)
         * @returns The pages that were found, as strings
         */
        public async crawlPages( url: string, pageCount: number = 10 ): Promise<Array<string>> {
            // Nothing to crawl at the base level
            return [];
        }
 | 
			
		||||
 | 
			
		||||
    // #endregion
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,11 @@
 | 
			
		|||
import * as axiosPackage from 'axios';
 | 
			
		||||
const axios = axiosPackage.default;
 | 
			
		||||
 | 
			
		||||
/**
 * Request a page with an HTTP GET.
 *
 * A `User-Agent` header of `Mozilla/5.0` is sent with the request.
 *
 * @param url The address of the page to request
 * @returns The promise produced by `axios.get`, passed through untouched
 */
export function getPageContents(url: string): Promise<axiosPackage.AxiosResponse<unknown, any>> {
    // Options handed to axios for this request
    const requestOptions = {
        headers: {
            'User-Agent': 'Mozilla/5.0',
        },
    };

    return axios.get(url, requestOptions);
}
 | 
			
		||||
| 
						 | 
				
			
			@ -1,17 +1,169 @@
 | 
			
		|||
import {Post, Tag} from "../type/generic";
 | 
			
		||||
import {Scrapper} from "../class/Scrapper";
 | 
			
		||||
import {Post, Tag, LogEntry, LogType} from "../type/generic";
 | 
			
		||||
import {Scrapper}                     from "../class/Scrapper";
 | 
			
		||||
import {getPageContents}              from "../helper/requestManager";
 | 
			
		||||
import * as cheerio from 'cheerio';
 | 
			
		||||
 | 
			
		||||
class Rule34xxx extends Scrapper {
 | 
			
		||||
/**
 * Scrapper implementation for rule34.xxx.
 *
 * Every public method first validates its URL with `checkURLBase` and throws
 * when the URL does not belong to this website. Failures are also appended to
 * `this.logs` before being rethrown.
 */
export class Rule34xxx extends Scrapper {

    constructor() {
        // Set the domain base of the current Scrapper as "rule34.xxx"
        super("https://rule34.xxx");
    }

    /**
     * Get the details of a specific post.
     *
     * Only the URL validation is implemented so far; the method always resolves to `null`.
     *
     * @param url The URL to the post, this must be the actual page which contains the image, tags, etc...
     * @returns Currently always `null`
     * @throws Error when `url` does not belong to this website
     */
    public async getPostDetails( url: string ): Promise<Post | null> {
        // Check if the provided link is valid
        if ( !this.checkURLBase(url) ) {
            throw new Error(`Invalid url provided`);
        }

        // TODO: request the post page and extract its details
        return null;
    }

    /**
     * Get the links to all of the posts listed on a page.
     *
     * @param url The URL of the listing page
     * @returns Absolute URLs of the posts found inside the page's `.thumb` elements
     * @throws Error when `url` does not belong to this website, or when the page cannot be fetched
     */
    public async getPostsFromPage( url: string ): Promise<Array<string>> {
        // Check if the provided link is valid
        if ( !this.checkURLBase(url) ) {
            throw new Error(`Invalid url provided`);
        }

        // Fetch the page and process its posts with cheerio
        const $ = cheerio.load(await this.fetchPageHTML(url, `getPostsFromPage`));

        const postList: Array<string> = [];

        // An arrow callback keeps `this` bound to the scrapper, the element comes in as an argument
        $(`.thumb`).each( (_index, thumb) => {
            const href = $(thumb).find(`a`).attr(`href`);

            // Only keep links which actually point at a post view, a thumb without one is skipped
            if ( href !== undefined && href.includes(`page=post&s=view&id=`) ) {
                postList.push(this.toAbsoluteURL(href));
            }
        });

        return postList;
    }

    /**
     * Get a list of pages by starting to crawl from a specific page and following
     * the "next" pagination link.
     *
     * Crawling stops early when a page has no usable "next" link.
     *
     * @param url The starting page
     * @param pageCount The maximum number of pages to crawl
     * @param batchSize Currently unused by this implementation
     * @returns The URLs of the pages which were visited, in order
     * @throws Error when `url` does not belong to this website, or when a page cannot be fetched
     */
    public async crawlPages( url: string, pageCount: number = 10, batchSize: number = 1 ): Promise<Array<string>> {
        // Check if the provided link is valid
        if ( !this.checkURLBase(url) ) {
            throw new Error(`Invalid url provided`);
        }

        // A list of all of the found pages
        const foundPages: Array<string> = [];
        // The next url we are hitting
        let nextPage: string = url;

        // Go through as many pages as requested
        for ( let i = 0; i < pageCount; i++ ) {

            if ( this.verbose ) {
                console.log(`[${i+1}/${pageCount}]Crawling ${nextPage}`);
            }

            // Fetch the page and process it with cheerio
            const $ = cheerio.load(await this.fetchPageHTML(nextPage, `crawlPages`));

            // The page was fetched successfully, so it counts as found
            foundPages.push(nextPage);

            // `attr` yields undefined both when there is no button and when it has no href
            const nextHref = $(`a[alt="next"]`).attr(`href`);
            if ( nextHref === undefined ) {
                // No way forward from this page, stop crawling
                break;
            }

            nextPage = this.toAbsoluteURL(nextHref);
        }

        // Return the found pages
        return foundPages;
    }

    /**
     * Download a page and return its body, logging any failure under the caller's name.
     *
     * @param url The page to download
     * @param caller Name of the public method on whose behalf the request is made, used in the log message
     * @returns The response body
     * @throws Whatever the request rejected with, or an Error for a response status outside 200-299
     */
    private async fetchPageHTML( url: string, caller: string ): Promise<string> {
        try {
            const response = await getPageContents(url);

            // Defensive check, presumably axios already rejects such responses by default
            if ( response.status < 200 || response.status > 299 ) {
                this.logs.push({
                    msg:  `Invalid response code[${response.status}]`,
                    type: LogType.ERROR,
                    err:  null,
                    data: null,
                    ts:   new Date(),
                });
                throw new Error(`Invalid response code[${response.status}]`);
            }

            return (response.data as string);

        } catch ( err ) {
            // Record the failure and let the caller deal with it
            this.logs.push({
                msg:  `[Error]::${caller}::`,
                type: LogType.ERROR,
                err:  err instanceof Error ? err : new Error(String(err)),
                data: null,
                ts:   new Date(),
            });
            throw err;
        }
    }

    /**
     * Turn a link found on a page into an absolute URL on this website.
     *
     * @param href The link as written in the page, relative to the website's root
     * @returns The link prefixed with `this.domain`
     */
    private toAbsoluteURL( href: string ): string {
        // Drop leading slashes so the join below never produces "//"
        return `${this.domain}/${href.replace(/^\/+/, ``)}`;
    }

}
 | 
			
		||||
							
								
								
									
										26
									
								
								src/test.ts
								
								
								
								
							
							
						
						
									
										26
									
								
								src/test.ts
								
								
								
								
							| 
						 | 
				
			
			@ -1,2 +1,26 @@
 | 
			
		|||
// This is the test file for the library; different tests are run in here.
 | 
			
		||||
console.log(`Working I guess`);
 | 
			
		||||
import {Rule34xxx} from "./module/rule34xxx";
 | 
			
		||||
import {Post} from "./type/generic";
 | 
			
		||||
 | 
			
		||||
// Manual smoke test: crawl a tag listing on rule34.xxx and print the logs and the result.
( async () => {
    // Initialize the rule34 module
    const r34: Rule34xxx = new Rule34xxx();
    r34.verbose = true;

    // Stays undefined when the crawl fails, so the output below shows that no result was produced
    let postDetails: Array<string> | undefined;
    try {
        postDetails = await r34.crawlPages(`https://rule34.xxx/index.php?page=post&s=list&tags=thelorope`, 35);
    } catch ( err ) {
        // The failure is also recorded in r34.logs, printed below
        console.log(err);
    }

    // Display results
    console.log({
        logs:   r34.logs,
        result: postDetails,
    });
})();
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -14,12 +14,12 @@ export interface Post {
 | 
			
		|||
    /**
 | 
			
		||||
     * URL to the original post link
 | 
			
		||||
     */
 | 
			
		||||
    url?: string,
 | 
			
		||||
    url: string,
 | 
			
		||||
 | 
			
		||||
    /**
 | 
			
		||||
     * A link to the full resolution image or video
 | 
			
		||||
     */
 | 
			
		||||
    contentURL: string,
 | 
			
		||||
    contentURL?: string,
 | 
			
		||||
 | 
			
		||||
    /**
 | 
			
		||||
     * The optional link for the source of the image
 | 
			
		||||
| 
						 | 
				
			
			@ -29,10 +29,23 @@ export interface Post {
 | 
			
		|||
    /**
 | 
			
		||||
     * A list of all of the tags the post has
 | 
			
		||||
     */
 | 
			
		||||
    tags: Array<Tag>,
 | 
			
		||||
    tags?: Array<Tag>,
 | 
			
		||||
 | 
			
		||||
    /**
 | 
			
		||||
     * The date of the post's creation
 | 
			
		||||
     */
 | 
			
		||||
    ts?: string,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/**
 * The severity of a log entry.
 */
export enum LogType {
    /** Something went wrong */
    ERROR = "error",
    /** Informational message */
    INFO = "info",
}
 | 
			
		||||
 | 
			
		||||
/**
 * A single record in a log.
 */
export interface LogEntry {
    /** The severity of the entry */
    type: LogType;

    /** Human readable description of what happened */
    msg: string;

    /** Free-form context attached to the entry, may be null */
    data: any;

    /** The error behind the entry, or null when there is none */
    err: Error | null;

    /** When the entry was created */
    ts: Date;
}
 | 
			
		||||
		Reference in New Issue