crawler-url-parser

    2.0.5 • Public • Published

    crawler-url-parser

    A URL parser for crawling purposes

    version downloads node status

    Installation

    npm install crawler-url-parser

    Usage

    Parse

    const cup = require('crawler-url-parser');
     
    //// parse(current_url[,base_url])
    let result = cup.parse("http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2");
     
    console.log(result.url);
    // http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2
     
    console.log(result.baseurl);
    // null
     
    console.log(result.normalized);
    // http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2
     
    console.log(result.host); 
    // question.stackoverflow.com
     
    console.log(result.domain); 
    // stackoverflow.com
     
    console.log(result.subdomain); 
    // question
     
    console.log(result.protocol); 
    // http:
     
    console.log(result.path); 
    // /aaa/bbb/ddd
     
    console.log(result.search); 
    // q1=query1&q2=query2
     
    console.log(result.querycount); 
    // 2

    Parse with baseURL

    const cup = require('crawler-url-parser');
     
    //// parse(current_url[,base_url])
    let result = cup.parse("../ddd?q1=query1&q2=query2","http://question.stackoverflow.com/aaa/bbb/ccc/");
     
    console.log(result.url);
    // http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2
     
    console.log(result.baseurl);
    // http://question.stackoverflow.com/aaa/bbb/ccc
     
    console.log(result.normalized);
    // http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2
     
    console.log(result.host); 
    // question.stackoverflow.com
     
    console.log(result.domain); 
    // stackoverflow.com
     
    console.log(result.subdomain); 
    // question
     
    console.log(result.protocol); 
    // http:
     
    console.log(result.path); 
    // /aaa/bbb/ddd
     
    console.log(result.search); 
    // q1=query1&q2=query2
     
    console.log(result.querycount); 
    // 2

    Extract

    const cup = require('crawler-url-parser');
     
    //// extract(html_str,current_url);
    let htmlStr='<html><body> \
        <a href="http://best.question.stackoverflow.com">subdomain</a><br /> \
        <a href="http://faq.stackoverflow.com">subdomain</a><br /> \
        <a href="http://stackoverflow.com">updomain</a><br /> \
        <a href="http://www.google.com">external</a><br /> \
        <a href="http://www.facebook.com">external</a><br /> \
        <a href="http://question.stackoverflow.com/aaa/bbb/ccc">sublevel</a><br /> \
        <a href="http://question.stackoverflow.com/aaa/bbb/zzz">sublevel</a><br /> \
        <a href="http://question.stackoverflow.com/aaa/">uplevel</a><br /> \
        <a href="http://question.stackoverflow.com/aaa/ddd">samelevel</a><br /> \
        <a href="http://question.stackoverflow.com/aaa/eee">samelevel</a><br /> \
        <a href="http://question.stackoverflow.com/aaa/ddd/eee">internal</a><br /> \
        <a href="http://question.stackoverflow.com/zzz">internal</a><br /> \
    </body></html>';
     
    let currentUrl= "http://question.stackoverflow.com/aaa/bbb";
    let urls = cup.extract(htmlStr,currentUrl);
     
    console.log(urls[0].type); //subdomain
    console.log(urls[1].type); //subdomain
    console.log(urls[2].type); //updomain
    console.log(urls[3].type); //external
    console.log(urls[4].type); //external
    console.log(urls[5].type); //sublevel
    console.log(urls[6].type); //sublevel
    console.log(urls[7].type); //uplevel
    console.log(urls[8].type); //samelevel
    console.log(urls[9].type); //samelevel
    console.log(urls[10].type); //internal
    console.log(urls[11].type); //internal
     

    Level

    const cup = require('crawler-url-parser');
     
    //// gettype(current_url,base_url);
    let level = cup.gettype("sub.domain.com/aaa/bbb/","sub.domain.com/aaa/bbb/ccc");
    console.log(level); //sublevel
     
    level = cup.gettype("sub.domain.com/aaa/bbb/ccc/ddd","sub.domain.com/aaa/bbb/ccc");
    console.log(level); //uplevel
     
    level = cup.gettype("sub.domain.com/aaa/bbb/eee","sub.domain.com/aaa/bbb/ccc");
    console.log(level); //samelevel
     
    level = cup.gettype("sub.domain.com/aaa/bbb/eee","sub.anotherdomain.com/aaa/bbb/ccc");
    console.log(level); //external

    Test

    Support

    I use this package actively myself, so it has my top priority. You can chat on WhatsApp about any information, ideas, or suggestions.

    WhatsApp

    Submitting an Issue

    If you find a bug or a mistake, you can help by submitting an issue to the GitLab repository.

    Creating a Merge Request

    GitLab calls it a merge request instead of a pull request.

    License

    This package is MIT licensed, and all its dependencies are MIT or BSD licensed.

    Install

    npm i crawler-url-parser

    Weekly Downloads

    189

    Version

    2.0.5

    License

    MIT

    Last publish

    Collaborators

    • avatar