+ Some more work on rule34.xxx
+ rule34xxx Get all posts from a page + rule34xxx Crawl pages
This commit is contained in:
parent
11224096f9
commit
2ed81fb668
|
@ -1,4 +1,4 @@
|
||||||
import {Post, Tag} from "../type/generic";
|
import {Post, Tag, LogEntry, LogType} from "../type/generic";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The base class of the scrappers, any of the website scrappers must extend this class
|
* The base class of the scrappers, any of the website scrappers must extend this class
|
||||||
|
@ -13,29 +13,85 @@ export class Scrapper {
|
||||||
/**
|
/**
|
||||||
* An array of all of the logs
|
* An array of all of the logs
|
||||||
*/
|
*/
|
||||||
public logs: Array<any> = [];
|
public logs: Array<LogEntry> = [];
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The fully qualified domain of the website to scrap, for example "rule34.life"
|
* The origin of the website to scrape (scheme and host, without a trailing "/"), for example "https://rule34.life"
|
||||||
*/
|
*/
|
||||||
public domain: string = ``;
|
public domain: string = ``;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the details of a specific post
|
* Whether progress messages are printed to the console while scraping
|
||||||
* @param url The URL to the post, this must be the actual page which contains the image, tags, etc...
|
|
||||||
*/
|
*/
|
||||||
public async getPostDetails( url: string ): Promise<Post | null> {
|
public verbose: boolean = false;
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
// #region Protected Functions
|
||||||
* Get a list of posts from the mentioned page
|
|
||||||
* @param url
|
|
||||||
* @returns
|
|
||||||
*/
|
|
||||||
public async getPostsFromPage( url: string ): Promise<Array<Post>> {
|
|
||||||
return [];
|
|
||||||
}
|
|
||||||
|
|
||||||
|
/**
 * Check that a URL belongs to the website this scrapper targets.
 *
 * The URL is parsed with the global `URL` class and its origin is compared
 * against `this.domain`. Every failure is recorded in `this.logs`.
 *
 * @param url The URL to validate
 * @returns `true` when the URL parses and its origin equals `this.domain`, otherwise `false`
 */
protected checkURLBase(url: string): boolean {
    try {
        // Try and build a new URL class; this throws when the URL cannot be parsed
        const instance: URL = new URL(url);

        // Check if the origin matches ours
        if (instance.origin === this.domain) {
            return true;
        }

        this.logs.push({
            type: LogType.ERROR,
            msg: `Invalid URL provided`,
            data: {
                // `url` holds the expected origin; the key is kept as-is for backward compatibility
                url: this.domain,
                origin: instance.origin,
                // The URL that was actually rejected
                provided: url,
            },
            err: null,
            ts: new Date(),
        });
    } catch ( err ) {
        this.logs.push({
            type: LogType.ERROR,
            msg: `Failed to parse provided URL`,
            // Record the offending input so the log entry is actionable
            data: {
                provided: url,
            },
            err: (err as Error),
            ts: new Date(),
        });
    }

    // Return a failure
    return false;
}
|
||||||
|
|
||||||
|
// #endregion
|
||||||
|
|
||||||
|
// #region Public Functions
|
||||||
|
/**
 * Get the details of a specific post.
 *
 * This base implementation performs no request and always resolves to `null`.
 *
 * @param url The URL to the post, this must be the actual page which contains the image, tags, etc...
 * @returns `null` in this base implementation
 */
public async getPostDetails( url: string ): Promise<Post | null> {
    return null;
}
|
||||||
|
|
||||||
|
/**
 * Get a list of posts from the mentioned page.
 *
 * This base implementation performs no request and always resolves to an empty list.
 *
 * @param url The URL of the page to read the posts from
 * @returns An empty list in this base implementation
 */
public async getPostsFromPage( url: string ): Promise<Array<string>> {
    return new Array<string>();
}
|
||||||
|
|
||||||
|
/**
 * Get a list of pages by starting to crawl from a specific page.
 *
 * This base implementation performs no request and always resolves to an empty list.
 *
 * @param url The starting page, this will crawl as many pages as you mention
 * @param pageCount The number of pages to crawl
 * @returns An empty list in this base implementation
 */
public async crawlPages( url: string, pageCount: number = 10 ): Promise<Array<string>> {
    return new Array<string>();
}
|
||||||
|
|
||||||
|
// #endregion
|
||||||
|
|
||||||
}
|
}
|
|
@ -0,0 +1,11 @@
|
||||||
|
import * as axiosPackage from 'axios';
|
||||||
|
const axios = axiosPackage.default;
|
||||||
|
|
||||||
|
/**
 * Fetch a page with an HTTP GET request.
 *
 * The request is sent through axios with a fixed `User-Agent` header of `Mozilla/5.0`.
 *
 * @param url The URL to request
 * @returns The promise returned by `axios.get`, resolving to the axios response
 */
export function getPageContents(url: string): Promise<axiosPackage.AxiosResponse<unknown, any>> {
    // Options handed to axios alongside the URL
    const requestOptions = {
        headers: {
            'User-Agent': 'Mozilla/5.0',
        },
    };

    // Return the axios function's promise
    return axios.get(url, requestOptions);
}
|
|
@ -1,17 +1,169 @@
|
||||||
import {Post, Tag} from "../type/generic";
|
import {Post, Tag, LogEntry, LogType} from "../type/generic";
|
||||||
import {Scrapper} from "../class/Scrapper";
|
import {Scrapper} from "../class/Scrapper";
|
||||||
|
import {getPageContents} from "../helper/requestManager";
|
||||||
|
import * as cheerio from 'cheerio';
|
||||||
|
|
||||||
class Rule34xxx extends Scrapper {
|
export class Rule34xxx extends Scrapper {
|
||||||
|
|
||||||
|
constructor() {
    // Hand the rule34.xxx origin (scheme + host, no trailing slash) to the base Scrapper constructor
    super("https://rule34.xxx");
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the details of a specific post
|
* Get the details of a specific post
|
||||||
* @param url The URL to the post, this must be the actual page which contains the image, tags, etc...
|
* @param url The URL to the post, this must be the actual page which contains the image, tags, etc...
|
||||||
*/
|
*/
|
||||||
public async getPostDetails( url: string ): Promise<Post | null> {
|
public async getPostDetails( url: string ): Promise<Post | null> {
|
||||||
|
|
||||||
|
// Check if the provided link is valid
|
||||||
|
if ( !this.checkURLBase(url) ) {
|
||||||
|
throw new Error(`Invalid url provided`);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Send out the request to grab the contents of the post
|
||||||
|
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Get the links of all posts listed on a single page.
 *
 * The page is downloaded, every `.thumb` element is inspected, and the `href`
 * of the link inside it is turned into an absolute URL by prefixing it with
 * `this.domain`.
 *
 * @param url The URL of the page to read; it must pass `checkURLBase`
 * @returns The absolute URLs of the posts found on the page
 * @throws When the URL is rejected by `checkURLBase`, when the request fails,
 *         or when the response status is outside the 200-299 range
 */
public async getPostsFromPage( url: string ): Promise<Array<string>> {

    // Check if the provided link is valid
    if ( !this.checkURLBase(url) ) {
        throw new Error(`Invalid url provided`);
    }

    // The raw HTML of the page; assigned inside the try block below
    let pageContents: string;

    // Send out the request to grab the contents of the page
    try {
        const response = await getPageContents(url);

        if ( response.status < 200 || response.status > 299 ) {
            this.logs.push({
                msg: `Invalid response code[${response.status}]`,
                type: LogType.ERROR,
                err: null,
                data: null,
                ts: new Date()
            });
            throw new Error(`Invalid response code[${response.status}]`);
        }

        pageContents = (response.data as string);

    } catch ( err ) {
        // Record the failure, then let the caller deal with it
        this.logs.push({
            msg: `[Error]::getPostsFromPage::`,
            type: LogType.ERROR,
            err: (err as Error),
            data: null,
            ts: new Date()
        });
        throw err;
    }

    // Process the page's posts with cheerio
    const $ = cheerio.load(pageContents);

    // Define the post list
    const postList: Array<string> = [];

    // Links shorter than the post-view query prefix are skipped
    const minHrefLength = `index.php?page=post&s=view&id=`.length;

    // Go through all of the posts
    $(`.thumb`).each( (_index, element) => {
        const href = $(element).find(`a`).attr(`href`);

        // Thumbnails without a link are skipped as well
        if ( href !== undefined && href.length >= minHrefLength ) {
            postList.push(`${this.domain}/${href}`);
        }
    });

    return postList;
}
|
||||||
|
|
||||||
|
/**
 * Get a list of pages by starting to crawl from a specific page.
 *
 * Each page is downloaded and the link matching `a[alt="next"]` is followed
 * until `pageCount` pages were visited or no such link is found.
 *
 * @param url The starting page; it must pass `checkURLBase`
 * @param pageCount The maximum number of pages to crawl
 * @param batchSize Not used by this implementation
 * @returns The URLs of the pages that were downloaded, in crawl order
 * @throws When the URL is rejected by `checkURLBase`, when a request fails,
 *         or when a response status is outside the 200-299 range
 */
public async crawlPages( url: string, pageCount: number = 10, batchSize: number = 1 ): Promise<Array<string>> {

    // Check if the provided link is valid
    if ( !this.checkURLBase(url) ) {
        throw new Error(`Invalid url provided`);
    }

    // A list of all of the found pages
    const foundPages = new Array<string>();

    // The next url we are hitting
    let nextPage: string = url;

    // Go through as many pages as requested
    for ( let i = 0; i < pageCount; i++ ) {

        if ( this.verbose ) {
            console.log(`[${i+1}/${pageCount}]Crawling ${nextPage}`);
        }

        // The raw HTML of the page; assigned inside the try block below
        let pageContents: string;

        // Download the current page
        try {
            const response = await getPageContents(nextPage);

            if ( response.status < 200 || response.status > 299 ) {
                this.logs.push({
                    msg: `Invalid response code[${response.status}]`,
                    type: LogType.ERROR,
                    err: null,
                    data: null,
                    ts: new Date()
                });
                throw new Error(`Invalid response code[${response.status}]`);
            }

            pageContents = (response.data as string);

        } catch ( err ) {
            // Record the failure, then let the caller deal with it
            this.logs.push({
                msg: `[Error]::crawlPages::`,
                type: LogType.ERROR,
                err: (err as Error),
                data: null,
                ts: new Date()
            });
            throw err;
        }

        // Process the page with cheerio
        const $ = cheerio.load(pageContents);

        // Add the current page we are on to the list
        foundPages.push(nextPage);

        // `attr` yields undefined both when there is no "next" link and when the link has no href
        const nextHref = $(`a[alt="next"]`).attr(`href`);
        if ( nextHref === undefined ) {
            // Nothing to follow, so the crawl ends here
            break;
        }

        nextPage = `${this.domain}/` + nextHref;
    }

    // Return the found pages
    return foundPages;
}
|
||||||
|
|
||||||
}
|
}
|
26
src/test.ts
26
src/test.ts
|
@ -1,2 +1,26 @@
|
||||||
// This is the test file for the library, different tests are ran in here.
|
// This is the test file for the library, different tests are ran in here.
|
||||||
console.log(`Working I guess`);
|
import {Rule34xxx} from "./module/rule34xxx";
|
||||||
|
import {Post} from "./type/generic";
|
||||||
|
|
||||||
|
( async () => {
    // Initialize the rule34 module with console progress output enabled
    const r34: Rule34xxx = new Rule34xxx();
    r34.verbose = true;

    // Crawl up to 35 pages from the starting page; stays undefined when the crawl fails
    let crawledPages: Array<string> | undefined;
    try {
        crawledPages = await r34.crawlPages(`https://rule34.xxx/index.php?page=post&s=list&tags=thelorope`, 35);
    } catch ( err ) {
        console.log(err);
    }

    // Display results
    console.log({
        logs: r34.logs,
        result: crawledPages
    });
})();
|
||||||
|
|
||||||
|
|
|
@ -14,12 +14,12 @@ export interface Post {
|
||||||
/**
|
/**
|
||||||
* URL to the original post link
|
* URL to the original post link
|
||||||
*/
|
*/
|
||||||
url?: string,
|
url: string,
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A link to the full resolution image or video
|
* A link to the full resolution image or video
|
||||||
*/
|
*/
|
||||||
contentURL: string,
|
contentURL?: string,
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The optional link for the source of the image
|
* The optional link for the source of the image
|
||||||
|
@ -29,10 +29,23 @@ export interface Post {
|
||||||
/**
|
/**
|
||||||
* A list of all of the tags the post has
|
* A list of all of the tags the post has
|
||||||
*/
|
*/
|
||||||
tags: Array<Tag>,
|
tags?: Array<Tag>,
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The date of the post's creation
|
* The date of the post's creation
|
||||||
*/
|
*/
|
||||||
ts?: string,
|
ts?: string,
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * The kind of a log entry
 */
export enum LogType {
    /** Something went wrong */
    ERROR = "error",

    /** Informational message */
    INFO = "info",
}
|
||||||
|
|
||||||
|
/**
 * A single entry of a scrapper's log
 */
export interface LogEntry {
    /** The kind of the entry */
    type: LogType;

    /** The human readable message */
    msg: string;

    /** Any extra data attached to the entry, or `null` when there is none */
    data: any;

    /** The error that caused the entry, or `null` when there is none */
    err: null | Error;

    /** When the entry was created */
    ts: Date;
}
|
Reference in New Issue