import {Post, Tag, LogEntry, LogType} from "../type/generic";
/**
 * The base class of the scrappers; any of the website scrappers must extend this class.
 */
export class Scrapper {
|
|
|
|
|
|
|
|
constructor(domain: string) {
|
|
|
|
// Set the domain
|
|
|
|
this.domain = domain;
|
|
|
|
};
|
2021-10-21 01:10:27 +03:00
|
|
|
|
|
|
|
/**
|
|
|
|
* An array of all of the logs
|
|
|
|
*/
|
2021-10-21 02:14:34 +03:00
|
|
|
public logs: Array<LogEntry> = [];
|
2021-10-21 01:10:27 +03:00
|
|
|
|
2021-10-21 01:23:48 +03:00
|
|
|
/**
|
2021-10-21 02:14:34 +03:00
|
|
|
* The fully qualified domain base without a trailing / of the website to scrap, for example "https://rule34.life"
|
2021-10-21 01:23:48 +03:00
|
|
|
*/
|
|
|
|
public domain: string = ``;
|
|
|
|
|
2021-10-21 01:10:27 +03:00
|
|
|
/**
|
2021-10-21 02:14:34 +03:00
|
|
|
* Display console logs
|
2021-10-21 01:10:27 +03:00
|
|
|
*/
|
2021-10-21 02:14:34 +03:00
|
|
|
public verbose: boolean = false;
|
2021-10-21 01:10:27 +03:00
|
|
|
|
2021-10-21 02:14:34 +03:00
|
|
|
// #region Protected Functions
|
|
|
|
|
|
|
|
protected checkURLBase(url: string) {
|
|
|
|
try {
|
|
|
|
// Try and build a new URL class
|
|
|
|
const instance: URL = new URL(url);
|
|
|
|
|
|
|
|
// Check if the origin matches ours
|
|
|
|
if (instance.origin == this.domain) {
|
|
|
|
// Return success
|
|
|
|
return true;
|
|
|
|
} else {
|
|
|
|
this.logs.push({
|
|
|
|
type: LogType.ERROR,
|
|
|
|
msg: `Invalid URL provided`,
|
|
|
|
data: {
|
|
|
|
url: this.domain,
|
|
|
|
origin: instance.origin
|
|
|
|
},
|
|
|
|
err: null,
|
|
|
|
ts: new Date(),
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
} catch ( err ) {
|
|
|
|
this.logs.push({
|
|
|
|
type: LogType.ERROR,
|
|
|
|
msg: `Failed to parse provided URL`,
|
|
|
|
data: null,
|
|
|
|
err: (err as Error),
|
|
|
|
ts: new Date(),
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return a failure
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// #endregion
|
|
|
|
|
|
|
|
// #region Public Functions
|
|
|
|
/**
|
|
|
|
* Get the details of a specific post
|
|
|
|
* @param url The URL to the post, this must be the actual page which contains the image, tags, etc...
|
|
|
|
*/
|
|
|
|
public async getPostDetails( url: string ): Promise<Post | null> {
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Get a list of posts from the mentioned page
|
|
|
|
* @param url
|
|
|
|
* @returns
|
|
|
|
*/
|
|
|
|
public async getPostsFromPage( url: string ): Promise<Array<string>> {
|
|
|
|
return [];
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Get a list of pages by starting to crawl from a specific page.
|
|
|
|
* @param url The starting page, this will crawl as many pages as you mention
|
|
|
|
* @param pageCount The number of pages to crawl
|
|
|
|
*/
|
|
|
|
public async crawlPages( url: string, pageCount: number = 10 ): Promise<Array<string>> {
|
|
|
|
return [];
|
|
|
|
}
|
2021-10-21 01:10:27 +03:00
|
|
|
|
2021-10-21 02:14:34 +03:00
|
|
|
// #endregion
|
2021-10-21 01:10:27 +03:00
|
|
|
|
|
|
|
}