Almost finished with the rule34.xxx module

+ rule34xxx GetPostDetails
This commit is contained in:
Daniel Legt 2021-10-21 02:30:42 +03:00
parent 2ed81fb668
commit fe2372dc97
2 changed files with 164 additions and 54 deletions

View File

@ -1,6 +1,6 @@
import {Post, Tag, LogEntry, LogType} from "../type/generic"; import { Post, Tag, LogEntry, LogType } from "../type/generic";
import {Scrapper} from "../class/Scrapper"; import { Scrapper } from "../class/Scrapper";
import {getPageContents} from "../helper/requestManager"; import { getPageContents } from "../helper/requestManager";
import * as cheerio from 'cheerio'; import * as cheerio from 'cheerio';
export class Rule34xxx extends Scrapper { export class Rule34xxx extends Scrapper {
@ -14,17 +14,127 @@ export class Rule34xxx extends Scrapper {
* Get the details of a specific post * Get the details of a specific post
* @param url The URL to the post, this must be the actual page which contains the image, tags, etc... * @param url The URL to the post, this must be the actual page which contains the image, tags, etc...
*/ */
public async getPostDetails(url: string): Promise<Post | null> {
    // Fetches a single post page and extracts its tags, media URL, source link and
    // posted timestamp.
    // Throws when `url` fails checkURLBase, when the request fails or answers with a
    // non-2xx status, or when the page has no `#tag-sidebar` element.

    // Check if the provided link is valid
    if (!this.checkURLBase(url)) {
        throw new Error(`Invalid url provided`);
    }

    // Send out the request to grab the contents of the post
    let pageContents: string;
    try {
        const response = await getPageContents(url);
        if (response.status < 200 || response.status > 299) {
            throw new Error(`Invalid response code[${response.status}]`);
        }
        pageContents = response.data as string;
    } catch (err) {
        // Log (when verbose) and re-throw so the caller decides how to recover
        if (this.verbose) {
            console.error(`[Error]::getPostDetails::`, err);
        }
        throw err;
    }

    // Process the page's contents with cheerio
    const $ = cheerio.load(pageContents);

    // Get the post's tags
    const postTags: Array<Tag> = [];
    const tagsSection = $(`#tag-sidebar`);
    if (tagsSection.length <= 0) {
        throw new Error(`Failed to find post tags, invalid post`);
    }

    tagsSection.find(`.tag`).each((_index, element) => {
        const tagElement = $(element);

        // The tag type is encoded in a `tag-type-<type>` class; default to "general".
        // attr() yields undefined when the attribute is missing, so guard before splitting.
        let tagType = `general`;
        for (const className of (tagElement.attr(`class`) ?? ``).split(" ")) {
            if (className.includes(`tag-type-`)) {
                tagType = className.split(`-type-`)[1] ?? `general`;
                break;
            }
        }

        // The slug is whatever follows `tags=` in the tag's link. A tag entry may hold
        // more than one anchor, so use the first one that actually carries `tags=`.
        const tagHref = tagElement
            .find(`a`)
            .toArray()
            .map(anchor => $(anchor).attr(`href`))
            .find(href => href !== undefined && href.includes(`tags=`));
        const tagSlug = tagHref?.split("tags=")[1] ?? `unknown`;

        // Add the tag to the postTags listing
        postTags.push({
            slug: tagSlug,
            type: tagType
        });
    });

    // Get the link to the post's original image/video; `ERROR` marks "nothing found"
    let postContent = `ERROR`;
    const ogImage = $(`meta[property="og:image"]`).attr(`content`);
    if (ogImage !== undefined) {
        postContent = ogImage;
    }

    // Make sure the postContent isn't just a link back to the post itself and/or that
    // we actually found something; otherwise fall back to the image shown on the page
    if (postContent == `ERROR` || postContent == url) {
        postContent = $(`#fit-to-screen img`).attr(`src`) ?? `ERROR`;
    }

    // Strip the query string from the media link.
    // NOTE(review): the `>= 5` threshold is kept from the original; its intent is unclear.
    if (postContent.indexOf(`?`) >= 5) {
        postContent = postContent.split(`?`)[0] ?? postContent;
    }

    // Get the source of the post (null when the page lists none)
    const postSource = $(`#stats a[rel="nofollow"]`).attr(`href`) ?? null;

    // Get the date the post was made on.
    // NOTE(review): this hard-coded fallback is returned as-is when the page has no
    // parsable date line; confirm that a placeholder timestamp is really intended.
    let postDate = "2021-10-17 13:18:27";
    const postedLine = $(`#stats li:nth-child(2)`).text().split("\n")[1];
    if (postedLine !== undefined) {
        postDate = postedLine.replace(/Posted: /g, '');
    }

    return {
        url: url,
        contentURL: postContent,
        source: postSource,
        tags: postTags,
        ts: postDate,
    };
}
/** /**
@ -32,10 +142,10 @@ export class Rule34xxx extends Scrapper {
* @param url * @param url
* @returns * @returns
*/ */
public async getPostsFromPage( url: string ): Promise<Array<string>> { public async getPostsFromPage(url: string): Promise<Array<string>> {
// Check if the provided link is valid // Check if the provided link is valid
if ( !this.checkURLBase(url) ) { if (!this.checkURLBase(url)) {
throw new Error(`Invalid url provided`); throw new Error(`Invalid url provided`);
} }
@ -46,23 +156,23 @@ export class Rule34xxx extends Scrapper {
try { try {
// Send out the initial Axios request to fetch the data from the page // Send out the initial Axios request to fetch the data from the page
await getPageContents(url) await getPageContents(url)
.then( request => { .then(request => {
if ( request.status < 200 || request.status > 299 ) { if (request.status < 200 || request.status > 299) {
this.logs.push({ this.logs.push({
msg: `Invalid response code[${request.status}]`, msg: `Invalid response code[${request.status}]`,
type: LogType.ERROR, type: LogType.ERROR,
err: null, err: null,
data: null, data: null,
ts: new Date() ts: new Date()
}); });
throw new Error(`Invalid response code[${request.status}]`); throw new Error(`Invalid response code[${request.status}]`);
} }
pageContents = (request.data as string); pageContents = (request.data as string);
}) })
} catch ( err ) { } catch (err) {
// "Handle" the error so that it's in the above .catch // "Handle" the error so that it's in the above .catch
this.logs.push({ this.logs.push({
msg: `[Error]::getPostsFromPage::`, msg: `[Error]::getPostsFromPage::`,
@ -84,19 +194,19 @@ export class Rule34xxx extends Scrapper {
let self = this; let self = this;
// Go through all of the posts // Go through all of the posts
$(`.thumb`).each( function() { $(`.thumb`).each(function () {
const href = $(this).find(`a`).attr(`href`); const href = $(this).find(`a`).attr(`href`);
if ( `${href}`.length >= `index.php?page=post&s=view&id=`.length ) if (`${href}`.length >= `index.php?page=post&s=view&id=`.length)
postList.push(`${self.domain}/${href}`); postList.push(`${self.domain}/${href}`);
}); });
return postList; return postList;
} }
public async crawlPages( url: string, pageCount: number = 10, batchSize: number = 1 ): Promise<Array<string>> { public async crawlPages(url: string, pageCount: number = 10, batchSize: number = 1): Promise<Array<string>> {
// Check if the provided link is valid // Check if the provided link is valid
if ( !this.checkURLBase(url) ) { if (!this.checkURLBase(url)) {
throw new Error(`Invalid url provided`); throw new Error(`Invalid url provided`);
} }
@ -106,10 +216,10 @@ export class Rule34xxx extends Scrapper {
let nextPage: string = url; let nextPage: string = url;
// Go through as many pages as requested // Go through as many pages as requested
for ( let i = 0; i < pageCount; i++ ) { for (let i = 0; i < pageCount; i++) {
if ( this.verbose ) { if (this.verbose) {
console.log(`[${i+1}/${pageCount}]Crawling ${nextPage}`); console.log(`[${i + 1}/${pageCount}]Crawling ${nextPage}`);
} }
// Initialize the page contents here // Initialize the page contents here
@ -119,23 +229,23 @@ export class Rule34xxx extends Scrapper {
try { try {
// Send out the initial Axios request to fetch the data from the page // Send out the initial Axios request to fetch the data from the page
await getPageContents(nextPage) await getPageContents(nextPage)
.then( request => { .then(request => {
if ( request.status < 200 || request.status > 299 ) { if (request.status < 200 || request.status > 299) {
this.logs.push({ this.logs.push({
msg: `Invalid response code[${request.status}]`, msg: `Invalid response code[${request.status}]`,
type: LogType.ERROR, type: LogType.ERROR,
err: null, err: null,
data: null, data: null,
ts: new Date() ts: new Date()
}); });
throw new Error(`Invalid response code[${request.status}]`); throw new Error(`Invalid response code[${request.status}]`);
} }
pageContents = (request.data as string); pageContents = (request.data as string);
}) })
} catch ( err ) { } catch (err) {
// "Handle" the error so that it's in the above .catch // "Handle" the error so that it's in the above .catch
this.logs.push({ this.logs.push({
msg: `[Error]::getPostsFromPage::`, msg: `[Error]::getPostsFromPage::`,
@ -154,7 +264,7 @@ export class Rule34xxx extends Scrapper {
foundPages.push(nextPage); foundPages.push(nextPage);
const nextPageButton = $(`a[alt="next"]`); const nextPageButton = $(`a[alt="next"]`);
if ( nextPageButton.length > 0 ) { if (nextPageButton.length > 0) {
nextPage = `${this.domain}/` + nextPageButton.attr(`href`); nextPage = `${this.domain}/` + nextPageButton.attr(`href`);
} else { } else {
// Since we didn't find the proper button, skip this page. // Since we didn't find the proper button, skip this page.

View File

@ -8,8 +8,8 @@ import {Post} from "./type/generic";
r34.verbose = true; r34.verbose = true;
// Run the get post Details function // Run the get post Details function
let postDetails: Array<string>; let postDetails: any;
await r34.crawlPages(`https://rule34.xxx/index.php?page=post&s=list&tags=thelorope`, 35) await r34.getPostDetails(`https://rule34.xxx/index.php?page=post&s=view&id=5203781`)
.then( postData => { .then( postData => {
postDetails = postData; postDetails = postData;
}) })