import {Post, Tag, LogEntry, LogType} from "../type/generic";
/**
 * The base class of the scrappers; any of the website scrappers must extend this class.
 */
export class Scrapper {
|
|
|
|
|
|
|
|
constructor(domain: string) {
|
|
|
|
// Set the domain
|
|
|
|
this.domain = domain;
|
|
|
|
};
|
2021-10-21 01:10:27 +03:00
|
|
|
|
|
|
|
/**
|
|
|
|
* An array of all of the logs
|
|
|
|
*/
|
2021-10-21 02:14:34 +03:00
|
|
|
public logs: Array<LogEntry> = [];
|
2021-10-21 01:10:27 +03:00
|
|
|
|
2021-10-21 01:23:48 +03:00
|
|
|
/**
|
2021-10-21 02:14:34 +03:00
|
|
|
* The fully qualified domain base without a trailing / of the website to scrap, for example "https://rule34.life"
|
2021-10-21 01:23:48 +03:00
|
|
|
*/
|
|
|
|
public domain: string = ``;
|
|
|
|
|
2021-10-21 01:10:27 +03:00
|
|
|
/**
|
2021-10-21 02:14:34 +03:00
|
|
|
* Display console logs
|
2021-10-21 01:10:27 +03:00
|
|
|
*/
|
2021-10-21 02:14:34 +03:00
|
|
|
public verbose: boolean = false;
|
2021-10-21 01:10:27 +03:00
|
|
|
|
2021-10-21 02:14:34 +03:00
|
|
|
// #region Protected Functions
|
|
|
|
|
|
|
|
protected checkURLBase(url: string) {
|
|
|
|
try {
|
|
|
|
// Try and build a new URL class
|
|
|
|
const instance: URL = new URL(url);
|
|
|
|
|
|
|
|
// Check if the origin matches ours
|
|
|
|
if (instance.origin == this.domain) {
|
|
|
|
// Return success
|
|
|
|
return true;
|
|
|
|
} else {
|
|
|
|
this.logs.push({
|
|
|
|
type: LogType.ERROR,
|
|
|
|
msg: `Invalid URL provided`,
|
|
|
|
data: {
|
|
|
|
url: this.domain,
|
|
|
|
origin: instance.origin
|
|
|
|
},
|
|
|
|
err: null,
|
|
|
|
ts: new Date(),
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
} catch ( err ) {
|
|
|
|
this.logs.push({
|
|
|
|
type: LogType.ERROR,
|
|
|
|
msg: `Failed to parse provided URL`,
|
|
|
|
data: null,
|
|
|
|
err: (err as Error),
|
|
|
|
ts: new Date(),
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return a failure
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// #endregion
|
|
|
|
|
|
|
|
// #region Public Functions
|
|
|
|
/**
|
|
|
|
* Get the details of a specific post
|
|
|
|
* @param url The URL to the post, this must be the actual page which contains the image, tags, etc...
|
|
|
|
*/
|
|
|
|
public async getPostDetails( url: string ): Promise<Post | null> {
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Get a list of posts from the mentioned page
|
|
|
|
* @param url
|
|
|
|
* @returns
|
|
|
|
*/
|
|
|
|
public async getPostsFromPage( url: string ): Promise<Array<string>> {
|
|
|
|
return [];
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Get a list of pages by starting to crawl from a specific page.
|
|
|
|
* @param url The starting page, this will crawl as many pages as you mention
|
|
|
|
* @param pageCount The number of pages to crawl
|
|
|
|
*/
|
|
|
|
public async crawlPages( url: string, pageCount: number = 10 ): Promise<Array<string>> {
|
|
|
|
return [];
|
|
|
|
}
|
2021-10-21 01:10:27 +03:00
|
|
|
|
2021-10-21 02:14:34 +03:00
|
|
|
// #endregion
|
2021-10-21 01:10:27 +03:00
|
|
|
|
|
|
|
}