Almost finished with the rule34.xxx module
+ rule34xxx GetPostDetails
This commit is contained in:
parent
2ed81fb668
commit
fe2372dc97
|
@ -1,6 +1,6 @@
|
||||||
import {Post, Tag, LogEntry, LogType} from "../type/generic";
|
import { Post, Tag, LogEntry, LogType } from "../type/generic";
|
||||||
import {Scrapper} from "../class/Scrapper";
|
import { Scrapper } from "../class/Scrapper";
|
||||||
import {getPageContents} from "../helper/requestManager";
|
import { getPageContents } from "../helper/requestManager";
|
||||||
import * as cheerio from 'cheerio';
|
import * as cheerio from 'cheerio';
|
||||||
|
|
||||||
export class Rule34xxx extends Scrapper {
|
export class Rule34xxx extends Scrapper {
|
||||||
|
@ -14,17 +14,127 @@ export class Rule34xxx extends Scrapper {
|
||||||
* Get the details of a specific post
|
* Get the details of a specific post
|
||||||
* @param url The URL to the post, this must be the actual page which contains the image, tags, etc...
|
* @param url The URL to the post, this must be the actual page which contains the image, tags, etc...
|
||||||
*/
|
*/
|
||||||
/**
 * Fetch a single post page and extract its tags, media URL, source link and
 * posting timestamp.
 *
 * @param url The URL of the post's own page (the page that contains the image, tags, etc.).
 * @returns The scraped post details. This implementation never resolves to `null`;
 *          `null` stays in the signature for backward compatibility. `contentURL` is
 *          `"ERROR"` when no media link could be located on the page.
 * @throws Error when `checkURLBase(url)` rejects the URL, when the response status is
 *         outside 200-299, or when the page has no `#tag-sidebar` element. Any error
 *         raised by the request itself is re-thrown unchanged.
 */
public async getPostDetails(url: string): Promise<Post | null> {

    // Check if the provided link is valid
    if (!this.checkURLBase(url)) {
        throw new Error(`Invalid url provided`);
    }

    // Send out the request to grab the contents of the post
    let pageHtml: string;
    try {
        const response = await getPageContents(url);

        if (response.status < 200 || response.status > 299) {
            throw new Error(`Invalid response code[${response.status}]`);
        }

        pageHtml = response.data as string;

    } catch (err) {
        // Log the failure (including the error itself) when verbose, then let
        // the caller decide how to recover.
        if (this.verbose) {
            console.error(`[Error]::getPostDetails::`, err);
        }
        throw err;
    }

    // Process the page with cheerio
    const $ = cheerio.load(pageHtml);

    // --- Tags ---------------------------------------------------------------
    const postTags: Array<Tag> = [];
    {
        const tagsSection = $(`#tag-sidebar`);
        if (tagsSection.length <= 0) {
            throw new Error(`Failed to find post tags, invalid post`);
        }

        tagsSection.find(`.tag`).each((_index, element) => {
            const tagElement = $(element);

            // The tag type is carried by a `tag-type-<type>` class; fall back
            // to "general" when no such class is present.
            const classList = (tagElement.attr(`class`) ?? ``).split(` `);
            const typeClass = classList.find(cls => cls.includes(`tag-type-`));
            const tagType = typeClass?.split(`-type-`)[1] ?? `general`;

            // The slug is whatever follows `tags=` in the tag link's href;
            // a missing anchor, href or `tags=` marker yields "unknown"
            // instead of throwing.
            const tagHref = tagElement.find(`a`).attr(`href`);
            const tagSlug = tagHref?.split(`tags=`)[1] ?? `unknown`;

            postTags.push({
                slug: tagSlug,
                type: tagType,
            });
        });
    }

    // --- Media URL ----------------------------------------------------------
    // Prefer the Open Graph image; a missing tag or missing `content`
    // attribute both count as "not found".
    let postContent = $(`meta[property="og:image"]`).attr(`content`) ?? `ERROR`;

    // The og:image value may merely point back at the post page (or be
    // absent), in which case use the on-page image instead.
    if (postContent === `ERROR` || postContent === url) {
        postContent = $(`#fit-to-screen img`).attr(`src`) ?? `ERROR`;
    }

    // Drop everything from the first `?` onwards.
    // NOTE(review): the `>= 5` threshold is kept from the original code; its
    // rationale is not visible here - confirm before changing.
    const queryStart = postContent.indexOf(`?`);
    if (queryStart >= 5) {
        postContent = postContent.slice(0, queryStart);
    }

    // --- Source link --------------------------------------------------------
    const postSource = $(`#stats a[rel="nofollow"]`).attr(`href`) ?? null;

    // --- Posting date -------------------------------------------------------
    // NOTE(review): this hard-coded timestamp is the original fallback used
    // when the date cannot be read from the page - confirm it is intentional.
    let postDate = "2021-10-17 13:18:27";
    {
        // The date is read from the second line of the second `#stats` item;
        // keep the fallback when that line does not exist.
        const postedLine = $(`#stats li:nth-child(2)`).text().split("\n")[1];
        if (postedLine !== undefined) {
            postDate = postedLine.replace(/Posted: /g, '');
        }
    }

    return {
        url: url,
        contentURL: postContent,
        source: postSource,
        tags: postTags,
        ts: postDate,
    };
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -32,10 +142,10 @@ export class Rule34xxx extends Scrapper {
|
||||||
* @param url
|
* @param url
|
||||||
* @returns
|
* @returns
|
||||||
*/
|
*/
|
||||||
public async getPostsFromPage( url: string ): Promise<Array<string>> {
|
public async getPostsFromPage(url: string): Promise<Array<string>> {
|
||||||
|
|
||||||
// Check if the provided link is valid
|
// Check if the provided link is valid
|
||||||
if ( !this.checkURLBase(url) ) {
|
if (!this.checkURLBase(url)) {
|
||||||
throw new Error(`Invalid url provided`);
|
throw new Error(`Invalid url provided`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -46,8 +156,8 @@ export class Rule34xxx extends Scrapper {
|
||||||
try {
|
try {
|
||||||
// Send out the initial Axios request to fetch the data from the page
|
// Send out the initial Axios request to fetch the data from the page
|
||||||
await getPageContents(url)
|
await getPageContents(url)
|
||||||
.then( request => {
|
.then(request => {
|
||||||
if ( request.status < 200 || request.status > 299 ) {
|
if (request.status < 200 || request.status > 299) {
|
||||||
this.logs.push({
|
this.logs.push({
|
||||||
msg: `Invalid response code[${request.status}]`,
|
msg: `Invalid response code[${request.status}]`,
|
||||||
type: LogType.ERROR,
|
type: LogType.ERROR,
|
||||||
|
@ -62,7 +172,7 @@ export class Rule34xxx extends Scrapper {
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
} catch ( err ) {
|
} catch (err) {
|
||||||
// "Handle" the error so that it's in the above .catch
|
// "Handle" the error so that it's in the above .catch
|
||||||
this.logs.push({
|
this.logs.push({
|
||||||
msg: `[Error]::getPostsFromPage::`,
|
msg: `[Error]::getPostsFromPage::`,
|
||||||
|
@ -84,19 +194,19 @@ export class Rule34xxx extends Scrapper {
|
||||||
let self = this;
|
let self = this;
|
||||||
|
|
||||||
// Go through all of the posts
|
// Go through all of the posts
|
||||||
$(`.thumb`).each( function() {
|
$(`.thumb`).each(function () {
|
||||||
const href = $(this).find(`a`).attr(`href`);
|
const href = $(this).find(`a`).attr(`href`);
|
||||||
if ( `${href}`.length >= `index.php?page=post&s=view&id=`.length )
|
if (`${href}`.length >= `index.php?page=post&s=view&id=`.length)
|
||||||
postList.push(`${self.domain}/${href}`);
|
postList.push(`${self.domain}/${href}`);
|
||||||
});
|
});
|
||||||
|
|
||||||
return postList;
|
return postList;
|
||||||
}
|
}
|
||||||
|
|
||||||
public async crawlPages( url: string, pageCount: number = 10, batchSize: number = 1 ): Promise<Array<string>> {
|
public async crawlPages(url: string, pageCount: number = 10, batchSize: number = 1): Promise<Array<string>> {
|
||||||
|
|
||||||
// Check if the provided link is valid
|
// Check if the provided link is valid
|
||||||
if ( !this.checkURLBase(url) ) {
|
if (!this.checkURLBase(url)) {
|
||||||
throw new Error(`Invalid url provided`);
|
throw new Error(`Invalid url provided`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -106,10 +216,10 @@ export class Rule34xxx extends Scrapper {
|
||||||
let nextPage: string = url;
|
let nextPage: string = url;
|
||||||
|
|
||||||
// Go through as many pages as requested
|
// Go through as many pages as requested
|
||||||
for ( let i = 0; i < pageCount; i++ ) {
|
for (let i = 0; i < pageCount; i++) {
|
||||||
|
|
||||||
if ( this.verbose ) {
|
if (this.verbose) {
|
||||||
console.log(`[${i+1}/${pageCount}]Crawling ${nextPage}`);
|
console.log(`[${i + 1}/${pageCount}]Crawling ${nextPage}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Initialize the page contents here
|
// Initialize the page contents here
|
||||||
|
@ -119,8 +229,8 @@ export class Rule34xxx extends Scrapper {
|
||||||
try {
|
try {
|
||||||
// Send out the initial Axios request to fetch the data from the page
|
// Send out the initial Axios request to fetch the data from the page
|
||||||
await getPageContents(nextPage)
|
await getPageContents(nextPage)
|
||||||
.then( request => {
|
.then(request => {
|
||||||
if ( request.status < 200 || request.status > 299 ) {
|
if (request.status < 200 || request.status > 299) {
|
||||||
this.logs.push({
|
this.logs.push({
|
||||||
msg: `Invalid response code[${request.status}]`,
|
msg: `Invalid response code[${request.status}]`,
|
||||||
type: LogType.ERROR,
|
type: LogType.ERROR,
|
||||||
|
@ -135,7 +245,7 @@ export class Rule34xxx extends Scrapper {
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
} catch ( err ) {
|
} catch (err) {
|
||||||
// "Handle" the error so that it's in the above .catch
|
// "Handle" the error so that it's in the above .catch
|
||||||
this.logs.push({
|
this.logs.push({
|
||||||
msg: `[Error]::getPostsFromPage::`,
|
msg: `[Error]::getPostsFromPage::`,
|
||||||
|
@ -154,7 +264,7 @@ export class Rule34xxx extends Scrapper {
|
||||||
foundPages.push(nextPage);
|
foundPages.push(nextPage);
|
||||||
|
|
||||||
const nextPageButton = $(`a[alt="next"]`);
|
const nextPageButton = $(`a[alt="next"]`);
|
||||||
if ( nextPageButton.length > 0 ) {
|
if (nextPageButton.length > 0) {
|
||||||
nextPage = `${this.domain}/` + nextPageButton.attr(`href`);
|
nextPage = `${this.domain}/` + nextPageButton.attr(`href`);
|
||||||
} else {
|
} else {
|
||||||
// Since we didn't find the proper button, skip this page.
|
// Since we didn't find the proper button, skip this page.
|
||||||
|
|
|
@ -8,8 +8,8 @@ import {Post} from "./type/generic";
|
||||||
r34.verbose = true;
|
r34.verbose = true;
|
||||||
|
|
||||||
// Run the get post Details function
|
// Run the get post Details function
|
||||||
let postDetails: Array<string>;
|
let postDetails: any;
|
||||||
await r34.crawlPages(`https://rule34.xxx/index.php?page=post&s=list&tags=thelorope`, 35)
|
await r34.getPostDetails(`https://rule34.xxx/index.php?page=post&s=view&id=5203781`)
|
||||||
.then( postData => {
|
.then( postData => {
|
||||||
postDetails = postData;
|
postDetails = postData;
|
||||||
})
|
})
|
||||||
|
|
Reference in New Issue