HentaiSnatcher/src/class/Scrapper.ts


import {Post, Tag, LogEntry, LogType} from "../type/generic";
/**
 * The base class for all scrappers; every website-specific scrapper must extend this class
 */
export class Scrapper {
  constructor(domain: string) {
    // Set the domain
    this.domain = domain;
  }
  /**
   * An array of all of the logs
   */
  public logs: Array<LogEntry> = [];
  /**
   * The fully qualified base domain of the website to scrape, without a trailing slash, for example "https://rule34.life"
   */
  public domain: string = ``;
  /**
   * Whether to display console logs
   */
  public verbose: boolean = false;
  // #region Protected Functions
  /**
   * Check that the provided URL belongs to this scrapper's domain
   * @param url The URL to validate
   * @returns true if the URL's origin matches the configured domain, false otherwise
   */
  protected checkURLBase(url: string) {
    try {
      // Try to build a new URL instance
      const instance: URL = new URL(url);
      // Check if the origin matches ours
      if (instance.origin === this.domain) {
        // Return success
        return true;
      } else {
        this.logs.push({
          type: LogType.ERROR,
          msg: `Invalid URL provided`,
          data: {
            url: url,
            domain: this.domain,
            origin: instance.origin
          },
          err: null,
          ts: new Date(),
        });
      }
    } catch (err) {
      this.logs.push({
        type: LogType.ERROR,
        msg: `Failed to parse provided URL`,
        data: null,
        err: (err as Error),
        ts: new Date(),
      });
    }
    // Return a failure
    return false;
  }
  // #endregion
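  // Note (illustrative, not part of the original source): for a scrapper
  // constructed with "https://rule34.life", checkURLBase() accepts
  // "https://rule34.life/index.php?id=1" because that URL's origin is
  // "https://rule34.life", while a URL on any other origin is logged as an
  // error and rejected.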
  // #region Public Functions
  /**
   * Get the details of a specific post
   * @param url The URL of the post; this must be the actual page which contains the image, tags, etc.
   * @returns The post details, or null if they could not be retrieved
   */
  public async getPostDetails( url: string ): Promise<Post | null> {
    // Base implementation is a stub; website scrappers override this
    return null;
  }
  /**
   * Get a list of posts from the given page
   * @param url The URL of the page to scrape for posts
   * @returns An array of post URLs found on the page
   */
  public async getPostsFromPage( url: string ): Promise<Array<string>> {
    // Base implementation is a stub; website scrappers override this
    return [];
  }
  /**
   * Get a list of pages by starting to crawl from a specific page.
   * @param url The starting page; crawling begins here and continues for up to pageCount pages
   * @param pageCount The number of pages to crawl
   * @returns An array of page URLs that were discovered
   */
  public async crawlPages( url: string, pageCount: number = 10 ): Promise<Array<string>> {
    // Base implementation is a stub; website scrappers override this
    return [];
  }
  // #endregion
}
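
// --------------------------------------------------------------------------
// Usage sketch (illustrative, not part of the original file): a minimal
// hypothetical subclass showing how a website scrapper is expected to extend
// the base class. "ExampleScrapper", its domain, and the override body are
// assumptions for demonstration only.
class ExampleScrapper extends Scrapper {
  constructor() {
    // Must match URL.origin exactly, so no trailing slash
    super("https://example.com");
  }

  public async getPostDetails( url: string ): Promise<Post | null> {
    // Reject URLs that do not belong to this scrapper's domain
    if (!this.checkURLBase(url)) {
      return null;
    }
    // A real subclass would fetch the page here and parse the image, tags,
    // etc. into a Post object
    return null;
  }
}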