+ Some more work on rule34.xxx

+ rule34xxx Get all posts from a page
+ rule34xxx Crawl pages
Daniel Legt 2021-10-21 02:14:34 +03:00
parent 11224096f9
commit 2ed81fb668
5 changed files with 281 additions and 25 deletions

View File

@ -1,4 +1,4 @@
import {Post, Tag, LogEntry, LogType} from "../type/generic";

/**
 * The base class of the scrappers, any of the website scrappers must extend this class
@ -13,29 +13,85 @@ export class Scrapper {
    /**
     * An array of all of the logs
     */
    public logs: Array<LogEntry> = [];

    /**
     * The fully qualified domain base without a trailing / of the website to scrap, for example "https://rule34.life"
     */
    public domain: string = ``;

    /**
     * Display console logs
     */
    public verbose: boolean = false;

    // #region Protected Functions

    protected checkURLBase(url: string) {
        try {
            // Try and build a new URL class
            const instance: URL = new URL(url);

            // Check if the origin matches ours
            if (instance.origin == this.domain) {
                // Return success
                return true;
            } else {
                this.logs.push({
                    type: LogType.ERROR,
                    msg: `Invalid URL provided`,
                    data: {
                        url: this.domain,
                        origin: instance.origin
                    },
                    err: null,
                    ts: new Date(),
                });
            }
        } catch ( err ) {
            this.logs.push({
                type: LogType.ERROR,
                msg: `Failed to parse provided URL`,
                data: null,
                err: (err as Error),
                ts: new Date(),
            });
        }

        // Return a failure
        return false;
    }

    // #endregion

    // #region Public Functions

    /**
     * Get the details of a specific post
     * @param url The URL to the post, this must be the actual page which contains the image, tags, etc...
     */
    public async getPostDetails( url: string ): Promise<Post | null> {
        return null;
    }

    /**
     * Get a list of posts from the mentioned page
     * @param url
     * @returns
     */
    public async getPostsFromPage( url: string ): Promise<Array<string>> {
        return [];
    }

    /**
     * Get a list of pages by starting to crawl from a specific page.
     * @param url The starting page, this will crawl as many pages as you mention
     * @param pageCount The number of pages to crawl
     */
    public async crawlPages( url: string, pageCount: number = 10 ): Promise<Array<string>> {
        return [];
    }

    // #endregion
}
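
A note on the new checkURLBase helper: it compares URL.origin against this.domain, so the configured domain must include the protocol and omit the trailing slash. A small illustrative sketch follows; the DemoScrapper subclass is hypothetical, and it assumes the base constructor accepts and stores the domain, as the Rule34xxx subclass further down suggests.

import {Scrapper} from "./class/Scrapper";

// Hypothetical subclass used only to exercise the protected origin check
class DemoScrapper extends Scrapper {
    constructor() {
        // Assumption: the base constructor stores the given domain in this.domain
        super(`https://rule34.life`);
    }
    public check(url: string): boolean {
        return this.checkURLBase(url);
    }
}

const demo = new DemoScrapper();
console.log(demo.check(`https://rule34.life/index.php?page=post&s=list`)); // expected: true
console.log(demo.check(`https://example.com/whatever`));                   // expected: false, and a LogEntry is pushed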

View File

@ -0,0 +1,11 @@
import * as axiosPackage from 'axios';
const axios = axiosPackage.default;
export function getPageContents(url: string): Promise<axiosPackage.AxiosResponse<unknown, any>> {
    // Return the axios function's promise
    return axios.get(url, {
        headers: {
            'User-Agent': 'Mozilla/5.0',
        }
    });
}
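
For reference, a minimal usage sketch of the getPageContents helper (illustrative only; the import path and the example URL are assumptions, not part of the commit):

import {getPageContents} from "./helper/requestManager";

(async () => {
    // Fetch one listing page and inspect the response; the URL is only an example
    const response = await getPageContents(`https://rule34.xxx/index.php?page=post&s=list`);
    console.log(response.status);
    console.log((response.data as string).length);
})();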

View File

@ -1,17 +1,169 @@
import {Post, Tag, LogEntry, LogType} from "../type/generic";
import {Scrapper} from "../class/Scrapper";
import {getPageContents} from "../helper/requestManager";
import * as cheerio from 'cheerio';

export class Rule34xxx extends Scrapper {

    constructor() {
        // Set the domain base of the current Scrapper as "rule34.xxx"
        super("https://rule34.xxx");
    }

    /**
     * Get the details of a specific post
     * @param url The URL to the post, this must be the actual page which contains the image, tags, etc...
     */
    public async getPostDetails( url: string ): Promise<Post | null> {
        // Check if the provided link is valid
        if ( !this.checkURLBase(url) ) {
            throw new Error(`Invalid url provided`);
        }

        // Send out the request to grab the contents of the post

        return null;
    }

    /**
     * Get a list of posts from the mentioned page
     * @param url
     * @returns
     */
    public async getPostsFromPage( url: string ): Promise<Array<string>> {
        // Check if the provided link is valid
        if ( !this.checkURLBase(url) ) {
            throw new Error(`Invalid url provided`);
        }

        // Initialize the page contents here
        let pageContents: string = null;

        // Send out the request to grab the contents of the post
        try {
            // Send out the initial Axios request to fetch the data from the page
            await getPageContents(url)
                .then( request => {
                    if ( request.status < 200 || request.status > 299 ) {
                        this.logs.push({
                            msg: `Invalid response code[${request.status}]`,
                            type: LogType.ERROR,
                            err: null,
                            data: null,
                            ts: new Date()
                        });
                        throw new Error(`Invalid response code[${request.status}]`);
                    }

                    pageContents = (request.data as string);
                })
        } catch ( err ) {
            // "Handle" the error so that it's in the above .catch
            this.logs.push({
                msg: `[Error]::getPostsFromPage::`,
                type: LogType.ERROR,
                err: (err as Error),
                data: null,
                ts: new Date()
            });
            throw err;
        }

        // Process the page's posts with cheerio
        const $ = cheerio.load((pageContents as string));

        // Define the post List
        const postList: Array<string> = [];

        // Workaround I guess
        let self = this;

        // Go through all of the posts
        $(`.thumb`).each( function() {
            const href = $(this).find(`a`).attr(`href`);
            if ( `${href}`.length >= `index.php?page=post&s=view&id=`.length )
                postList.push(`${self.domain}/${href}`);
        });

        return postList;
    }

    public async crawlPages( url: string, pageCount: number = 10, batchSize: number = 1 ): Promise<Array<string>> {
        // Check if the provided link is valid
        if ( !this.checkURLBase(url) ) {
            throw new Error(`Invalid url provided`);
        }

        // A list of all of the found pages
        let foundPages = new Array<string>();

        // The next url we are hitting
        let nextPage: string = url;

        // Go through as many pages as requested
        for ( let i = 0; i < pageCount; i++ ) {
            if ( this.verbose ) {
                console.log(`[${i+1}/${pageCount}]Crawling ${nextPage}`);
            }

            // Initialize the page contents here
            let pageContents: string = null;

            // Try and find the button to the next page
            try {
                // Send out the initial Axios request to fetch the data from the page
                await getPageContents(nextPage)
                    .then( request => {
                        if ( request.status < 200 || request.status > 299 ) {
                            this.logs.push({
                                msg: `Invalid response code[${request.status}]`,
                                type: LogType.ERROR,
                                err: null,
                                data: null,
                                ts: new Date()
                            });
                            throw new Error(`Invalid response code[${request.status}]`);
                        }

                        pageContents = (request.data as string);
                    })
            } catch ( err ) {
                // "Handle" the error so that it's in the above .catch
                this.logs.push({
                    msg: `[Error]::getPostsFromPage::`,
                    type: LogType.ERROR,
                    err: (err as Error),
                    data: null,
                    ts: new Date()
                });
                throw err;
            }

            // Process the page's posts with cheerio
            const $ = cheerio.load((pageContents as string));

            // Add the current page we are on to the list
            foundPages.push(nextPage);

            const nextPageButton = $(`a[alt="next"]`);
            if ( nextPageButton.length > 0 ) {
                nextPage = `${this.domain}/` + nextPageButton.attr(`href`);
            } else {
                // Since we didn't find the proper button, skip this page.
                break;
            }
        }

        // Return the found pages
        return foundPages;
    }
}
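
For context, a minimal sketch of driving the new getPostsFromPage method (illustrative only; the tag in the listing URL is a placeholder, not part of the diff):

import {Rule34xxx} from "./module/rule34xxx";

(async () => {
    const r34: Rule34xxx = new Rule34xxx();
    r34.verbose = true;

    // Collect the post links from a single listing page (placeholder tag)
    const posts: Array<string> = await r34.getPostsFromPage(`https://rule34.xxx/index.php?page=post&s=list&tags=example`);
    console.log(posts.length, r34.logs);
})();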

View File

@ -1,2 +1,26 @@
// This is the test file for the library, different tests are ran in here.
import {Rule34xxx} from "./module/rule34xxx";
import {Post} from "./type/generic";

( async () => {
    // Initialize the rule34 module
    const r34: Rule34xxx = new Rule34xxx();
    r34.verbose = true;

    // Run the get post Details function
    let postDetails: Array<string>;
    await r34.crawlPages(`https://rule34.xxx/index.php?page=post&s=list&tags=thelorope`, 35)
        .then( postData => {
            postDetails = postData;
        })
        .catch( err => {
            console.log(err);
        });

    // Display results
    console.log({
        logs: r34.logs,
        result: postDetails
    });
})();

View File

@ -14,12 +14,12 @@ export interface Post {
    /**
     * URL to the original post link
     */
    url: string,

    /**
     * A link to the full resolution image or video
     */
    contentURL?: string,

    /**
     * The optional link for the source of the image
@ -29,10 +29,23 @@ export interface Post {
    /**
     * A list of all of the tags the post has
     */
    tags?: Array<Tag>,

    /**
     * The date of the post's creation
     */
    ts?: string,
}

export enum LogType {
    ERROR = `error`,
    INFO = `info`,
}

export interface LogEntry {
    type: LogType,
    msg: string,
    data: any,
    err: null | Error,
    ts: Date,
}
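
For illustration only, this is the shape of the entries the scrappers push into Scrapper.logs using the new LogEntry interface and LogType enum (the message and data here are made up):

import {LogEntry, LogType} from "./type/generic";

// Example log entry matching what checkURLBase and getPostsFromPage record
const entry: LogEntry = {
    type: LogType.INFO,
    msg: `Crawled a page`,
    data: { url: `https://rule34.xxx/index.php?page=post&s=list` },
    err: null,
    ts: new Date(),
};
console.log(entry);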