Non-Potable Macchiato

    node-html-scraper
    TypeScript icon, indicating that this package has built-in type declarations

    1.0.9 • Public • Published

    node-html-scraper

    This is a typescript library that will help you parser, search and scrap data from html. with this library it will be very easy to convert the html to json with a very little code.

    Installation

    npm i node-html-scraper

    Usage

    interface ITest {
      name: string;
      image: string;
      children: IChild[];
    }
    
    interface IChild {
      name: any;
    }
    import Parser from 'node-html-scraper'
    var url = "www.google.se";
    var fetchedHtml=`<html>
    <div class="item">
    <p class="title">Item header 1</p>
    <img src="/test.png" />
    <ul>
    <li>item 1</li>
    <li>item 2</li>
    </ul>
    </div>
    <div class="item">
    <p class="title">Item header 2</p> 
    <img src="/test2.png" />
    <ul>
    <li>item 1</li>
    <li>item 2</li>
    </ul>
    </div>
    </html>`
    // we add the url here is to be able to complete the image src or link href if its a relative path.
    var parsedHtml = Parser.parseHtml(fetchedHtml, url);
    var items = parsedHtml.find(".item").toObject<ITest>()
                .field(x => x.name, a => a.select('.title').text())
                .field(x=> x.image, x=> x.select("img").attr("src").url())   
                .field(x => x.children, x => x.find("ul > li").map(m=> { 
                     return {
                         name: m.text(),  
                     };
                    })
                ).toList();  
            console.log(item);  

    Tranlsate a searchable Config

    You could also build a config json where you build your object throw it.

    note: for path see the settings below.

       var config = [
          {
            column: 'name',
            path: 's=.title&te=false',
          },
          {
            column: 'image',
            path: 's=img&at=src&ul',
          },
          {
            column: 'children',
            isArray: true,
            path: 'f=ul > li',
            nested: [
              {
                column: 'name',
                path: 'te=false',
              },
            ],
          },
        ];
    var parsedHtml = Parser.parseHtml(fetchedHtml, url);
    // see the result below
    var items = parsedHtml.find(".item").toObject<ITest>().translate(config).toList();

    Result

    This is how items will look like when its returned

    ▼0:
    name:"Item header 1"
    image:"www.google.se/test.png"
    ▼children:
    ►0:{name:"item 1"}
    ►1:{name:"item 2"}
    
    ▼1:
    name:"Item header 2"
    image:"www.google.se/test2.png"
    ▼children:
    ►0:{name:"item 1"}
    ►1:{name:"item 2"}

    IConfig Path selector settings

    you could build your path to your html node by using the settings below. Those path are seperated by & as queryString.

    // example s=.title
    s= select
    // example f=.title
    f= find
    // example s=.title&te=false/true
    te= text
    // example at=src
    at= attr
    // example ch
    ch= cleanHtml
    // example in
    ih= innerHtml
    // example oh
    oh= outerHtml
    // example pa
    pa= parent
    // example eq=0
    eq= index
    // example ft
    ft= first
    // lt
    lt= last
    // example ct=.item
    ct= closest
    // example at=src&ul
    ul= url

    ISelector

      /** find(querySelectorAll) items by its selector. available selector is css5  */
      find: (selector: string) => ISelector;
        /** find(querySelector) an item by its selector. available selector is css5  */
      select: (selector: string) => ISelector;
        /** get  closest by selector */
      closest: (selector: string) => ISelector;
       /** get attr of the element, eg .attr("title|alt").text()
        * This will try to find "title" if it failed or its empty it will try "alt" instead.
        */
      attr: (selector: string) => ISelector;
         /** get the full url. if the attr is full url then return it or return the baseurl/attr value
          * eg .attr("src").url()
           */
      url: () => string;
       /** get the last element of the selected items */
      last: () => ISelector;
       /** get the selected text, this differ from innerhtml as the text will be cleaned */
      text: (structuredText?: boolean) => string;
       /** selected node value  */
      nodeValue: () => string | undefined;
       /** attr value without changing it  */
      attValue: () => string | undefined;
        /** remove script, style , input etc and clean the html object before returned it  */
      cleanInnerHTML: () => string;
        /** the orginal innerhtml without changing it  */
      innerHTML: () => string;
       /** the orginal outerhtml without changing it  */
      outerHTML: () => string;
         /** return all innertext of the selected items  */
      textArray: () => string[];
         /** the selected attr has value  */
      hasValue: () => boolean;
         /** the selected element exists  */
      hasElement: () => boolean;
        /** the selected elements exists  */
      hasElements: () => boolean;
        /** length of the selected elements  */
      length:()=> number;
        /** remove selected items. available selector is css5   */
      remove: (selector: string) => ISelector;
         /** loop throw each item  */
      forEach:<T>(func: (jq: ISelector, index?: number, arr?: T[]) => void)=> void;
         /** loop throw each item and do a manual seach  */
      where:<T>(func: (jq: ISelector, index?: number, arr?: T[]) => boolean) => ISelector;
         /** loop throw each item and return the desire extracted result   */
      map:<T>(func: (jq: ISelector, index?: number, arr?: T[]) => T) => T[];
      reduce: <T>(func: (itemType: any, jq: ISelector, index?: number, arr?: T[]) => void,itemType: any) => any;
           /** parent of the selected item   */
      parent:()=> ISelector;
         /** find at index of the selected items   */
      eq: (index: number) => ISelector;
           /** children of the selected item   */
      children: ()=> ISelector;
      /** convert the parsed html to object   */
      toObject:<T extends {}>() => IObjectContructor<T>

    Install

    npm i node-html-scraper

    DownloadsWeekly Downloads

    10

    Version

    1.0.9

    License

    MIT

    Unpacked Size

    60.4 kB

    Total Files

    19

    Last publish

    Collaborators

    • alen.toma