Almost finished with the rule34.xxx module
+ rule34xxx GetPostDetails

parent 2ed81fb668
commit fe2372dc97

@@ -1,6 +1,6 @@
-import {Post, Tag, LogEntry, LogType} from "../type/generic";
-import {Scrapper}                     from "../class/Scrapper";
-import {getPageContents}              from "../helper/requestManager";
+import { Post, Tag, LogEntry, LogType } from "../type/generic";
+import { Scrapper } from "../class/Scrapper";
+import { getPageContents } from "../helper/requestManager";
 import * as cheerio from 'cheerio';
 
 export class Rule34xxx extends Scrapper {
@@ -14,17 +14,127 @@ export class Rule34xxx extends Scrapper {
      * Get the details of a specific post
      * @param url The URL to the post, this must be the actual page which contains the image, tags, etc...
      */
-    public async getPostDetails( url: string ): Promise<Post | null> {
+    public async getPostDetails(url: string): Promise<Post | null> {
 
         // Check if the provided link is valid
-        if ( !this.checkURLBase(url) ) {
+        if (!this.checkURLBase(url)) {
             throw new Error(`Invalid url provided`);
         }
 
-        return null;
+        // Init the variable here in case of an error
+        let pageContents = null;
+
+        // Send out the request to grab the contents of the post
+        try {
+            // Send out the initial Axios request to fetch the data from the page
+            pageContents = await getPageContents(url);
+
+            if (pageContents.status < 200 || pageContents.status > 299) {
+                throw new Error(`Invalid response code[${pageContents.status}]`);
+            }
+
+            pageContents = pageContents.data;
+
+        } catch (err) {
+            // "Handle" the error so that it's in the above .catch
+            if (this.verbose) {
+                console.error(`[Error]::getPostDetails::`);
+            }
+            throw err;
+        }
+
+        // Process the page's posts with cheerio
+        const $ = cheerio.load((pageContents as string));
+
+        const postTags: Array<Tag> = [];
+        { // Get the post's tags
+            const tagsSection = $(`#tag-sidebar`);
+            if (tagsSection.length <= 0) {
+                throw new Error(`Failed to find post tags, invalid post`);
+            }
+
+            // Run a query for all tags
+            tagsSection.find(`.tag`).each(function () {
+                // Go through the classes of the tag and see if we find the type
+                let tagType = "general";
+                const classList = $(this).attr(`class`).split(" ");
+                if (classList.length > 0) {
+                    for (let tt of classList) {
+                        if (tt.includes(`tag-type-`)) {
+                            tagType = tt.split(`-type-`)[1];
+                            break;
+                        }
+                    }
+                }
+
+                const tagAnchor = $(this).find(`a`);
+                let tagName = `unknown`;
+                let tagSlug = `unknown`;
+                { // Get the name of the tag and slug
+                    if (tagAnchor.length > 0) {
+                        tagName = tagAnchor.text();
+                        tagSlug = tagAnchor.attr(`href`).split("tags=")[1];
+                    }
+                }
+
+                // Add the tag to the postTags listing
+                postTags.push({
+                    slug: tagSlug ?? `unknown`,
+                    type: tagType ?? `general`
+                });
+
+            })
+
+        }
+
+        let postContent = `ERROR`;
+        { // get the link to the post's original image/video
+            const imageLink = $(`meta[property="og:image"]`);
+
+            if (imageLink.length > 0) {
+                postContent = imageLink.attr(`content`);
+            }
+
+            // Make sure the postContent isn't just a link back like they like to do and/or we didn't find anything
+            if (postContent == `ERROR` || postContent == url) {
+                // Get the current page's content
+                postContent = $(`#fit-to-screen img`).attr(`src`);
+            }
+
+            if (postContent.indexOf(`?`) >= 5) {
+                postContent = postContent.split(`?`)[0];
+            }
+
+            if (postContent.indexOf(`//`) >= 0) {
+                postContent = postContent;
+            }
+
+        }
+
+        // Get the source of the post
+        let postSource = null;
+        {
+            const sourceA = $(`#stats a[rel="nofollow"]`);
+            if (sourceA.length > 0) {
+                postSource = sourceA.attr(`href`);
+            }
+        }
+
+        let postDate = "2021-10-17 13:18:27";
+        {
+            const postDateRef = $(`#stats li:nth-child(2)`);
+            if (postDateRef.length > 0) {
+                postDate = postDateRef.text().split("\n")[1].replace(/Posted: /g, '');
+            }
+        }
+
+        return {
+            url: url,
+            contentURL: postContent ?? "ERROR",
+            source: postSource,
+            tags: postTags ?? [],
+            ts: postDate,
+        };
     }
 
     /**
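Note on the hunk above: the `if (postContent.indexOf(`//`) >= 0)` branch still assigns `postContent` to itself, so protocol-relative links currently pass through untouched. A minimal sketch of how that branch might be finished, assuming the intent is to force such links onto https (an assumption on the editor's part, not something this commit states):

            // Hypothetical completion (assumed intent): give protocol-relative
            // URLs an explicit scheme instead of the current self-assignment
            if (postContent.indexOf(`//`) === 0) {
                postContent = `https:${postContent}`;
            }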
@@ -32,10 +142,10 @@ export class Rule34xxx extends Scrapper {
      * @param url 
      * @returns 
      */
-    public async getPostsFromPage( url: string ): Promise<Array<string>> {
+    public async getPostsFromPage(url: string): Promise<Array<string>> {
 
         // Check if the provided link is valid
-        if ( !this.checkURLBase(url) ) {
+        if (!this.checkURLBase(url)) {
             throw new Error(`Invalid url provided`);
         }
 
@@ -46,8 +156,8 @@ export class Rule34xxx extends Scrapper {
         try {
             // Send out the initial Axios request to fetch the data from the page
             await getPageContents(url)
-            .then( request => {
-                if ( request.status < 200 || request.status > 299 ) {
-                    this.logs.push({
-                        msg: `Invalid response code[${request.status}]`,
-                        type: LogType.ERROR,
+                .then(request => {
+                    if (request.status < 200 || request.status > 299) {
+                        this.logs.push({
+                            msg: `Invalid response code[${request.status}]`,
+                            type: LogType.ERROR,
@@ -62,7 +172,7 @@ export class Rule34xxx extends Scrapper {
                 })
 
 
-        } catch ( err ) {
+        } catch (err) {
             // "Handle" the error so that it's in the above .catch
             this.logs.push({
                 msg: `[Error]::getPostsFromPage::`,
@@ -84,19 +194,19 @@ export class Rule34xxx extends Scrapper {
         let self = this;
 
         // Go through all of the posts
-        $(`.thumb`).each( function() {
+        $(`.thumb`).each(function () {
             const href = $(this).find(`a`).attr(`href`);
-            if ( `${href}`.length >= `index.php?page=post&s=view&id=`.length )
+            if (`${href}`.length >= `index.php?page=post&s=view&id=`.length)
                 postList.push(`${self.domain}/${href}`);
         });
 
         return postList;
     }
 
-    public async crawlPages( url: string, pageCount: number = 10, batchSize: number = 1 ): Promise<Array<string>> {
+    public async crawlPages(url: string, pageCount: number = 10, batchSize: number = 1): Promise<Array<string>> {
 
         // Check if the provided link is valid
-        if ( !this.checkURLBase(url) ) {
+        if (!this.checkURLBase(url)) {
             throw new Error(`Invalid url provided`);
         }
 
@@ -106,10 +216,10 @@ export class Rule34xxx extends Scrapper {
         let nextPage: string = url;
 
         // Go through as many pages as requested
-        for ( let i = 0; i < pageCount; i++ ) {
+        for (let i = 0; i < pageCount; i++) {
 
-            if ( this.verbose ) {
-                console.log(`[${i+1}/${pageCount}]Crawling ${nextPage}`);
+            if (this.verbose) {
+                console.log(`[${i + 1}/${pageCount}]Crawling ${nextPage}`);
             }
 
             // Initialize the page contents here
@@ -119,8 +229,8 @@ export class Rule34xxx extends Scrapper {
             try {
                 // Send out the initial Axios request to fetch the data from the page
                 await getPageContents(nextPage)
-                .then( request => {
-                    if ( request.status < 200 || request.status > 299 ) {
-                        this.logs.push({
-                            msg: `Invalid response code[${request.status}]`,
-                            type: LogType.ERROR,
+                    .then(request => {
+                        if (request.status < 200 || request.status > 299) {
+                            this.logs.push({
+                                msg: `Invalid response code[${request.status}]`,
+                                type: LogType.ERROR,
@@ -135,7 +245,7 @@ export class Rule34xxx extends Scrapper {
                     })
 
 
-            } catch ( err ) {
+            } catch (err) {
                 // "Handle" the error so that it's in the above .catch
                 this.logs.push({
                     msg: `[Error]::getPostsFromPage::`,
@@ -154,7 +264,7 @@ export class Rule34xxx extends Scrapper {
             foundPages.push(nextPage);
 
             const nextPageButton = $(`a[alt="next"]`);
-            if ( nextPageButton.length > 0 ) {
+            if (nextPageButton.length > 0) {
                 nextPage = `${this.domain}/` + nextPageButton.attr(`href`);
             } else {
                 // Since we didn't find the proper button, skip this page.
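A rough usage sketch combining the methods from the hunks above: crawl a few listing pages, pull the post links from each, then resolve every post to its full details. The import path, the no-argument constructor, and crawlPages resolving to listing-page URLs are assumptions on the editor's part, not guaranteed by this commit:

import { Rule34xxx } from "./module/rule34xxx"; // path assumed
import { Post } from "./type/generic";

(async () => {
    const r34 = new Rule34xxx();
    r34.verbose = true;

    // Walk two listing pages, then resolve every post found on them
    const pages = await r34.crawlPages(`https://rule34.xxx/index.php?page=post&s=list&tags=thelorope`, 2);
    const posts: Array<Post | null> = [];
    for (const page of pages) {
        for (const link of await r34.getPostsFromPage(page)) {
            posts.push(await r34.getPostDetails(link));
        }
    }
    console.log(`Resolved ${posts.length} posts`);
})();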
@@ -8,8 +8,8 @@ import {Post} from "./type/generic";
     r34.verbose = true;
     
     // Run the get post Details function
-    let postDetails: Array<string>;
-    await r34.crawlPages(`https://rule34.xxx/index.php?page=post&s=list&tags=thelorope`, 35)
+    let postDetails: any;
+    await r34.getPostDetails(`https://rule34.xxx/index.php?page=post&s=view&id=5203781`)
     .then(  postData => {
         postDetails = postData;
     })
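The test snippet above has no .catch, so a rejected request bubbles up. Inside the same async block it could also be written with a plain await; a small sketch, reusing r34 and Post from that file:

    // Sketch: same call as above, but with the error handled in a try/catch
    let postDetails: Post | null = null;
    try {
        postDetails = await r34.getPostDetails(`https://rule34.xxx/index.php?page=post&s=view&id=5203781`);
    } catch (err) {
        console.error(`getPostDetails failed:`, err);
    }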