diff --git a/src/module/rule34xxx.ts b/src/module/rule34xxx.ts index 7922101..be0d6d6 100644 --- a/src/module/rule34xxx.ts +++ b/src/module/rule34xxx.ts @@ -1,6 +1,6 @@ -import {Post, Tag, LogEntry, LogType} from "../type/generic"; -import {Scrapper} from "../class/Scrapper"; -import {getPageContents} from "../helper/requestManager"; +import { Post, Tag, LogEntry, LogType } from "../type/generic"; +import { Scrapper } from "../class/Scrapper"; +import { getPageContents } from "../helper/requestManager"; import * as cheerio from 'cheerio'; export class Rule34xxx extends Scrapper { @@ -14,17 +14,127 @@ export class Rule34xxx extends Scrapper { * Get the details of a specific post * @param url The URL to the post, this must be the actual page which contains the image, tags, etc... */ - public async getPostDetails( url: string ): Promise { + public async getPostDetails(url: string): Promise { // Check if the provided link is valid - if ( !this.checkURLBase(url) ) { + if (!this.checkURLBase(url)) { throw new Error(`Invalid url provided`); } + // Init the variable here in case of an error + let pageContents = null; + // Send out the request to grab the contents of the post + try { + // Send out the initial Axios request to fetch the data from the page + pageContents = await getPageContents(url); + if (pageContents.status < 200 || pageContents.status > 299) { + throw new Error(`Invalid response code[${pageContents.status}]`); + } - return null; + pageContents = pageContents.data; + + } catch (err) { + // "Handle" the error so that it's in the above .catch + if (this.verbose) { + console.error(`[Error]::getPostDetails::`); + } + throw err; + } + + // Process the page's posts with cheerio + const $ = cheerio.load((pageContents as string)); + + const postTags: Array = []; + { // Get the post's tags + const tagsSection = $(`#tag-sidebar`); + if (tagsSection.length <= 0) { + throw new Error(`Failed to find post tags, invalid post`); + } + + // Run a query for all tags + 
tagsSection.find(`.tag`).each(function () { + // Go through the classes of the tag and see if we find the type + let tagType = "general"; + const classList = $(this).attr(`class`).split(" "); + if (classList.length > 0) { + for (let tt of classList) { + if (tt.includes(`tag-type-`)) { + tagType = tt.split(`-type-`)[1]; + break; + } + } + } + + const tagAnchor = $(this).find(`a`); + let tagName = `unknown`; + let tagSlug = `unknown`; + { // Get the name of the tag and slug + if (tagAnchor.length > 0) { + tagName = tagAnchor.text(); + tagSlug = tagAnchor.attr(`href`).split("tags=")[1]; + } + } + + // Add the tag to the postTags listing + postTags.push({ + slug: tagSlug ?? `unknown`, + type: tagType ?? `general` + }); + + }) + + } + + let postContent = `ERROR`; + { // get the link to the post's original image/video + const imageLink = $(`meta[property="og:image"]`); + + if (imageLink.length > 0) { + postContent = imageLink.attr(`content`); + } + + // Make sure the postContent isn't just a link back like they like to do and/or we didn't find anything + if (postContent == `ERROR` || postContent == url) { + // Get the current page's content + postContent = $(`#fit-to-screen img`).attr(`src`); + } + + if (postContent.indexOf(`?`) >= 5) { + postContent = postContent.split(`?`)[0]; + } + + if (postContent.indexOf(`//`) >= 0) { + postContent = postContent; + } + + } + + // Get the source of the post + let postSource = null; + { + const sourceA = $(`#stats a[rel="nofollow"]`); + if (sourceA.length > 0) { + postSource = sourceA.attr(`href`); + } + } + + let postDate = "2021-10-17 13:18:27"; + { + const postDateRef = $(`#stats li:nth-child(2)`); + if (postDateRef.length > 0) { + postDate = postDateRef.text().split("\n")[1].replace(/Posted: /g, ''); + } + } + + return { + url: url, + contentURL: postContent ?? "ERROR", + source: postSource, + tags: postTags ?? 
[], + ts: postDate, + }; } /** @@ -32,10 +142,10 @@ export class Rule34xxx extends Scrapper { * @param url * @returns */ - public async getPostsFromPage( url: string ): Promise> { + public async getPostsFromPage(url: string): Promise> { // Check if the provided link is valid - if ( !this.checkURLBase(url) ) { + if (!this.checkURLBase(url)) { throw new Error(`Invalid url provided`); } @@ -46,26 +156,26 @@ export class Rule34xxx extends Scrapper { try { // Send out the initial Axios request to fetch the data from the page await getPageContents(url) - .then( request => { - if ( request.status < 200 || request.status > 299 ) { - this.logs.push({ - msg: `Invalid response code[${request.status}]`, - type: LogType.ERROR, - err: null, - data: null, - ts: new Date() - }); - throw new Error(`Invalid response code[${request.status}]`); - } - - pageContents = (request.data as string); - }) - - - } catch ( err ) { + .then(request => { + if (request.status < 200 || request.status > 299) { + this.logs.push({ + msg: `Invalid response code[${request.status}]`, + type: LogType.ERROR, + err: null, + data: null, + ts: new Date() + }); + throw new Error(`Invalid response code[${request.status}]`); + } + + pageContents = (request.data as string); + }) + + + } catch (err) { // "Handle" the error so that it's in the above .catch this.logs.push({ - msg: `[Error]::getPostsFromPage::`, + msg: `[Error]::getPostsFromPage::`, type: LogType.ERROR, err: (err as Error), data: null, @@ -84,19 +194,19 @@ export class Rule34xxx extends Scrapper { let self = this; // Go through all of the posts - $(`.thumb`).each( function() { + $(`.thumb`).each(function () { const href = $(this).find(`a`).attr(`href`); - if ( `${href}`.length >= `index.php?page=post&s=view&id=`.length ) + if (`${href}`.length >= `index.php?page=post&s=view&id=`.length) postList.push(`${self.domain}/${href}`); }); return postList; } - public async crawlPages( url: string, pageCount: number = 10, batchSize: number = 1 ): Promise> { + 
public async crawlPages(url: string, pageCount: number = 10, batchSize: number = 1): Promise> { // Check if the provided link is valid - if ( !this.checkURLBase(url) ) { + if (!this.checkURLBase(url)) { throw new Error(`Invalid url provided`); } @@ -106,10 +216,10 @@ export class Rule34xxx extends Scrapper { let nextPage: string = url; // Go through as many pages as requested - for ( let i = 0; i < pageCount; i++ ) { + for (let i = 0; i < pageCount; i++) { - if ( this.verbose ) { - console.log(`[${i+1}/${pageCount}]Crawling ${nextPage}`); + if (this.verbose) { + console.log(`[${i + 1}/${pageCount}]Crawling ${nextPage}`); } // Initialize the page contents here @@ -119,26 +229,26 @@ export class Rule34xxx extends Scrapper { try { // Send out the initial Axios request to fetch the data from the page await getPageContents(nextPage) - .then( request => { - if ( request.status < 200 || request.status > 299 ) { - this.logs.push({ - msg: `Invalid response code[${request.status}]`, - type: LogType.ERROR, - err: null, - data: null, - ts: new Date() - }); - throw new Error(`Invalid response code[${request.status}]`); - } - - pageContents = (request.data as string); - }) - - - } catch ( err ) { + .then(request => { + if (request.status < 200 || request.status > 299) { + this.logs.push({ + msg: `Invalid response code[${request.status}]`, + type: LogType.ERROR, + err: null, + data: null, + ts: new Date() + }); + throw new Error(`Invalid response code[${request.status}]`); + } + + pageContents = (request.data as string); + }) + + + } catch (err) { // "Handle" the error so that it's in the above .catch this.logs.push({ - msg: `[Error]::getPostsFromPage::`, + msg: `[Error]::getPostsFromPage::`, type: LogType.ERROR, err: (err as Error), data: null, @@ -154,7 +264,7 @@ export class Rule34xxx extends Scrapper { foundPages.push(nextPage); const nextPageButton = $(`a[alt="next"]`); - if ( nextPageButton.length > 0 ) { + if (nextPageButton.length > 0) { nextPage = `${this.domain}/` + 
nextPageButton.attr(`href`); } else { // Since we didn't find the proper button, skip this page. diff --git a/src/test.ts b/src/test.ts index eae0a3b..8ba10df 100644 --- a/src/test.ts +++ b/src/test.ts @@ -8,8 +8,8 @@ import {Post} from "./type/generic"; r34.verbose = true; // Run the get post Details function - let postDetails: Array; - await r34.crawlPages(`https://rule34.xxx/index.php?page=post&s=list&tags=thelorope`, 35) + let postDetails: any; + await r34.getPostDetails(`https://rule34.xxx/index.php?page=post&s=view&id=5203781`) .then( postData => { postDetails = postData; })