+ Some more work on rule34.xxx
+ rule34xxx Get all posts from a page + rule34xxx Crawl pages
This commit is contained in:
parent
11224096f9
commit
2ed81fb668
|
@ -1,4 +1,4 @@
|
|||
import {Post, Tag} from "../type/generic";
|
||||
import {Post, Tag, LogEntry, LogType} from "../type/generic";
|
||||
|
||||
/**
|
||||
* The base class of the scrappers, any of the website scrappers must extend this class
|
||||
|
@ -13,29 +13,85 @@ export class Scrapper {
|
|||
/**
|
||||
* An array of all of the logs
|
||||
*/
|
||||
public logs: Array<any> = [];
|
||||
public logs: Array<LogEntry> = [];
|
||||
|
||||
/**
|
||||
* The fully qualified domain of the website to scrap, for example "rule34.life"
|
||||
* The origin (scheme and host, without a trailing slash) of the website to scrape, for example "https://rule34.life"
|
||||
*/
|
||||
public domain: string = ``;
|
||||
|
||||
/**
|
||||
* Get the details of a specific post
|
||||
* @param url The URL to the post, this must be the actual page which contains the image, tags, etc...
|
||||
* Whether to print progress messages to the console
|
||||
*/
|
||||
public async getPostDetails( url: string ): Promise<Post | null> {
|
||||
return null;
|
||||
}
|
||||
public verbose: boolean = false;
|
||||
|
||||
/**
|
||||
* Get a list of posts from the mentioned page
|
||||
* @param url
|
||||
* @returns
|
||||
*/
|
||||
public async getPostsFromPage( url: string ): Promise<Array<Post>> {
|
||||
return [];
|
||||
}
|
||||
// #region Protected Functions
|
||||
|
||||
/**
 * Check that a URL belongs to the website this scrapper targets.
 *
 * The URL is parsed with the `URL` class and its origin is compared against
 * `this.domain`. Every rejection is appended to `this.logs`; parse failures
 * are caught here and reported through the return value.
 *
 * @param url The URL to validate
 * @returns `true` when the URL's origin equals `this.domain`; `false` when it differs or the URL cannot be parsed
 */
protected checkURLBase(url: string): boolean {
    let origin: string;

    try {
        // Building a URL instance throws on malformed input
        origin = new URL(url).origin;
    } catch ( err ) {
        this.logs.push({
            type: LogType.ERROR,
            msg: `Failed to parse provided URL`,
            data: null,
            err: (err as Error),
            ts: new Date(),
        });

        // Return a failure
        return false;
    }

    // Both sides are strings, so compare without coercion
    if ( origin === this.domain ) {
        return true;
    }

    this.logs.push({
        type: LogType.ERROR,
        msg: `Invalid URL provided`,
        data: {
            // Record the rejected URL itself so the entry can be traced back to the input
            url: url,
            expected: this.domain,
            origin: origin,
        },
        err: null,
        ts: new Date(),
    });

    // Return a failure
    return false;
}
|
||||
|
||||
// #endregion
|
||||
|
||||
// #region Public Functions
|
||||
/**
|
||||
* Get the details of a specific post
|
||||
* @param url The URL to the post, this must be the actual page which contains the image, tags, etc...
|
||||
*/
|
||||
public async getPostDetails(url: string): Promise<Post | null> {
    // Default implementation: reports that no post was found.
    // Site-specific scrappers override this method.
    const noPost: Post | null = null;
    return noPost;
}
|
||||
|
||||
/**
|
||||
* Get a list of posts from the mentioned page
|
||||
* @param url
|
||||
* @returns
|
||||
*/
|
||||
public async getPostsFromPage(url: string): Promise<Array<string>> {
    // Default implementation: an empty list of post links.
    // Site-specific scrappers override this method.
    const noPosts: Array<string> = [];
    return noPosts;
}
|
||||
|
||||
/**
|
||||
* Get a list of pages by starting to crawl from a specific page.
|
||||
* @param url The starting page, this will crawl as many pages as you mention
|
||||
* @param pageCount The number of pages to crawl
|
||||
*/
|
||||
public async crawlPages(url: string, pageCount: number = 10): Promise<Array<string>> {
    // Default implementation: no pages are visited, so the list is empty.
    // Site-specific scrappers override this method.
    const noPages: Array<string> = [];
    return noPages;
}
|
||||
|
||||
// #endregion
|
||||
|
||||
}
|
|
@ -0,0 +1,11 @@
|
|||
import * as axiosPackage from 'axios';
|
||||
const axios = axiosPackage.default;
|
||||
|
||||
/**
 * Fetch a page with an HTTP GET request through axios.
 *
 * @param url The address to request
 * @param userAgent Value sent in the `User-Agent` request header; defaults to `Mozilla/5.0`, the value that used to be hard-coded
 * @returns The promise produced by `axios.get`; the payload is typed `unknown`, so callers must narrow `data` themselves
 */
export function getPageContents(url: string, userAgent: string = 'Mozilla/5.0'): Promise<axiosPackage.AxiosResponse<unknown, any>> {
    // Return the axios function's promise; rejections reach the caller untouched
    return axios.get(url, {
        headers: {
            'User-Agent': userAgent,
        },
    });
}
|
|
@ -1,7 +1,14 @@
|
|||
import {Post, Tag} from "../type/generic";
|
||||
import {Scrapper} from "../class/Scrapper";
|
||||
import {Post, Tag, LogEntry, LogType} from "../type/generic";
|
||||
import {Scrapper} from "../class/Scrapper";
|
||||
import {getPageContents} from "../helper/requestManager";
|
||||
import * as cheerio from 'cheerio';
|
||||
|
||||
class Rule34xxx extends Scrapper {
|
||||
export class Rule34xxx extends Scrapper {
|
||||
|
||||
constructor() {
    // Pass this site's origin ("https://rule34.xxx") to the Scrapper base constructor
    super("https://rule34.xxx");
}
|
||||
|
||||
/**
|
||||
* Get the details of a specific post
|
||||
|
@ -9,9 +16,154 @@ class Rule34xxx extends Scrapper {
|
|||
*/
|
||||
public async getPostDetails(url: string): Promise<Post | null> {
    // Refuse links whose origin is not this site's
    const belongsToSite = this.checkURLBase(url);
    if ( !belongsToSite ) {
        throw new Error(`Invalid url provided`);
    }

    // The request for the post's contents is not implemented yet,
    // so no details are returned for a valid link.
    return null;
}
|
||||
|
||||
/**
 * Collect the links to the posts shown on a listing page.
 *
 * @param url The listing page to read; it must pass `checkURLBase`
 * @returns One absolute URL (`this.domain`, a `/`, then the link's `href`) per `.thumb` element whose link is long enough to be a post link
 * @throws Error when the URL is rejected by `checkURLBase`, when the response status is outside 200-299, or when the request itself fails; request failures are also appended to `this.logs` before being rethrown
 */
public async getPostsFromPage( url: string ): Promise<Array<string>> {

    // Check if the provided link is valid
    if ( !this.checkURLBase(url) ) {
        throw new Error(`Invalid url provided`);
    }

    // Assigned inside the try block; the catch block always rethrows,
    // so the value is set whenever execution continues past it
    let pageContents: string;

    try {
        // Fetch the page
        const response = await getPageContents(url);

        if ( response.status < 200 || response.status > 299 ) {
            this.logs.push({
                msg: `Invalid response code[${response.status}]`,
                type: LogType.ERROR,
                err: null,
                data: null,
                ts: new Date()
            });
            throw new Error(`Invalid response code[${response.status}]`);
        }

        pageContents = (response.data as string);

    } catch ( err ) {
        // Record the failure, then let the caller deal with it
        this.logs.push({
            msg: `[Error]::getPostsFromPage::`,
            type: LogType.ERROR,
            err: (err as Error),
            data: null,
            ts: new Date()
        });
        throw err;
    }

    // Process the page's posts with cheerio
    const $ = cheerio.load(pageContents);

    // A post link is at least as long as this query-string prefix
    const postLinkPrefix = `index.php?page=post&s=view&id=`;

    const postList: Array<string> = [];

    // An arrow callback keeps `this` bound to the scrapper, so no alias is needed
    $(`.thumb`).each( (_index, thumb) => {
        const href = $(thumb).find(`a`).attr(`href`);

        // Thumbnails without a link, or with a link too short to be a post link, are skipped
        if ( href !== undefined && href.length >= postLinkPrefix.length ) {
            postList.push(`${this.domain}/${href}`);
        }
    });

    return postList;
}
|
||||
|
||||
/**
 * Walk a paginated listing by following its "next" link, starting from `url`.
 *
 * @param url The first listing page; it must pass `checkURLBase`
 * @param pageCount The maximum number of pages to visit
 * @param batchSize Not read by this implementation; kept in the signature for existing callers
 * @returns The URLs of the pages that were fetched, in visiting order
 * @throws Error when the URL is rejected by `checkURLBase`, when a response status is outside 200-299, or when a request fails; request failures are also appended to `this.logs` before being rethrown
 */
public async crawlPages( url: string, pageCount: number = 10, batchSize: number = 1 ): Promise<Array<string>> {

    // Check if the provided link is valid
    if ( !this.checkURLBase(url) ) {
        throw new Error(`Invalid url provided`);
    }

    // A list of all of the found pages
    const foundPages: Array<string> = [];

    // The next url we are hitting
    let nextPage: string = url;

    // Go through as many pages as requested
    for ( let i = 0; i < pageCount; i++ ) {

        if ( this.verbose ) {
            console.log(`[${i+1}/${pageCount}]Crawling ${nextPage}`);
        }

        // Assigned inside the try block; the catch block always rethrows
        let pageContents: string;

        try {
            // Fetch the current page
            const response = await getPageContents(nextPage);

            if ( response.status < 200 || response.status > 299 ) {
                this.logs.push({
                    msg: `Invalid response code[${response.status}]`,
                    type: LogType.ERROR,
                    err: null,
                    data: null,
                    ts: new Date()
                });
                throw new Error(`Invalid response code[${response.status}]`);
            }

            pageContents = (response.data as string);

        } catch ( err ) {
            // Record the failure under this method's own label, then rethrow
            this.logs.push({
                msg: `[Error]::crawlPages::`,
                type: LogType.ERROR,
                err: (err as Error),
                data: null,
                ts: new Date()
            });
            throw err;
        }

        // Process the page with cheerio
        const $ = cheerio.load(pageContents);

        // Add the current page we are on to the list
        foundPages.push(nextPage);

        // `attr` yields undefined both when no "next" link exists and when it carries no href
        const nextHref = $(`a[alt="next"]`).attr(`href`);
        if ( nextHref === undefined ) {
            // Nothing to follow, so stop crawling
            break;
        }

        nextPage = `${this.domain}/${nextHref}`;
    }

    // Return the found pages
    return foundPages;
}
|
||||
|
||||
}
|
26
src/test.ts
26
src/test.ts
|
@ -1,2 +1,26 @@
|
|||
// This is the test file for the library; different tests are run in here.
|
||||
console.log(`Working I guess`);
|
||||
import {Rule34xxx} from "./module/rule34xxx";
|
||||
import {Post} from "./type/generic";
|
||||
|
||||
// Crawl a tag listing with the rule34.xxx scrapper and print the collected pages and logs
( async () => {
    // Initialize the rule34 module
    const r34: Rule34xxx = new Rule34xxx();
    r34.verbose = true;

    // Stays undefined when the crawl fails, which keeps a failure
    // distinguishable from an empty result
    let postDetails: Array<string> | undefined;

    try {
        postDetails = await r34.crawlPages(`https://rule34.xxx/index.php?page=post&s=list&tags=thelorope`, 35);
    } catch ( err ) {
        console.log(err);
    }

    // Display results
    console.log({
        logs: r34.logs,
        result: postDetails
    });
})();
|
||||
|
||||
|
|
|
@ -14,12 +14,12 @@ export interface Post {
|
|||
/**
|
||||
* URL to the original post link
|
||||
*/
|
||||
url?: string,
|
||||
url: string,
|
||||
|
||||
/**
|
||||
* A link to the full resolution image or video
|
||||
*/
|
||||
contentURL: string,
|
||||
contentURL?: string,
|
||||
|
||||
/**
|
||||
* The optional link for the source of the image
|
||||
|
@ -29,10 +29,23 @@ export interface Post {
|
|||
/**
|
||||
* A list of all of the tags the post has
|
||||
*/
|
||||
tags: Array<Tag>,
|
||||
tags?: Array<Tag>,
|
||||
|
||||
/**
|
||||
* The date of the post's creation
|
||||
*/
|
||||
ts?: string,
|
||||
}
|
||||
|
||||
/**
 * The kind of a log entry.
 */
export enum LogType {
    /** Marks entries that record a failure */
    ERROR = "error",
    /** Marks informational entries */
    INFO = "info",
}
|
||||
|
||||
/**
 * The shape of one log record.
 */
export interface LogEntry {
    /** The kind of entry */
    type: LogType;
    /** Human-readable description */
    msg: string;
    /** Extra context attached to the entry; may be `null` */
    data: any;
    /** The error behind the entry, or `null` when there is none */
    err: Error | null;
    /** Timestamp of the entry */
    ts: Date;
}
|
Reference in New Issue