爬虫html分析模块
用于分析html文档中所需的数据
字段的分析策略
- normal
基础模式;分析单个字段。
示例配置
{"key": "num","selector": ["#text_Keywords"],"dealStrategy": "normal","methodInfo": { "val": [] },"formats": [{ "key": "trim", "settings": { "start": true, "end": true, "middle": true } },{ "key": "regexp", "settings": { "regexp": "/\\d+/", "scope": "i", "index": 0 } },{ "key": "num" }]}
- array
数组模式;可以分析出数组字段。
示例配置
{"key": "array","selector": ["#ylHnTime li"],"dealStrategy": "array","data": [{"key": "name","selector": ["a"],"methodInfo": { "text": [] },"dealStrategy": "normal"}]}
- switch,case
switch模式;用于解析飘忽不定的字段,必须和case搭配来使用。
示例配置
{"selector": ["#ylHnTime li"],"dealStrategy": "switch","data": [{"selector": "a","methodInfo": { "attr": ["title"] },"match": "0-1岁","data": [{"key": "tag","selector": ["a"],"formats": [{ "str": [] }],"methodInfo": { "text": [] }}],"dealStrategy": "case"}]}
- object
object模式;解析带数据结构的字段。
示例配置
{"key": "obj","selector": ["#ylNav2"],"dealStrategy": "object","data": [{"key": "title","selector": ["a.ylHdNavTt"],"methodInfo": { "text": [] },"dealStrategy": "normal"}, {"key": "title-1","selector": [".ylHdNavCon a"],"dealStrategy": "array","data": [{"key": "","selector": [],"methodInfo": { "text": [] },"dealStrategy": "normal"}]}]}
- or
or模式;用于字段有多个不确定的地方可以取值,只命中一个。
示例配置
{"key": "cur-text","dealStrategy": "or","data": [{"selector": [".zsTobTabUl .cur a"],"methodInfo": { "text": [] },"dealStrategy": "normal"},{"selector": [".key_main .key_ul .hover"],"methodInfo": { "text": [] },"dealStrategy": "normal"}]}
字段的format策略
- date; 转换成标准日期
参数结构
{"type":"object","properties":{"format":{"title":"日期的format格式","type":"string"}}}
返回值类型: string
- json;转换成json格式
参数结构
{"type":"object","properties":{"parse":{"title":"是否转换","type":"boolean"},"func":{"title":"转换方法","type":"string"}}}
返回值类型: object
- num;转换成数字
参数结构: 无参数
返回值类型: number
- qs;转换地址栏参数
参数结构
{"type":"object","properties":{"pointer":{"title":"地址栏参数的路径","type":"boolean"}}}
返回值类型: string
- regexp;通过正则转换数据
参数结构
{"type":"object","required":["regexp"],"properties":{"regexp":{"title":"正则规则","type":"string"},"scope":{"title":"正则的scope","type":"string"},"index":{"title":"正则matchs的索引","type":"number"}}}
返回值类型: string
- split;通过split分栏转换数组数据
参数结构
{"type":"object","required":["splitOf","start"],"properties":{"splitOf":{"title":"分隔符","type":"string"},"start":{"title":"起始索引","type":"number"},"end":{"title":"结束索引","type":"number"},"join":{"title":"转换字符串分隔符","type":"string"}}}
返回值类型: string | Array
- trim;去空格
参数结构
{"type":"object","properties":{"middle":{"title":"中间空格","type":"boolean"},"start":{"title":"起始空格","type":"boolean"},"end":{"title":"结束空格","type":"boolean"}}}
返回值类型: string
当前支持的模式
- 分析页面
pattern: role:crawler.plugin.html,cmd:html
测试地址:http://172.16.112.215:9002/act POST
返回数据结构:
{"type":"array","description":"返回的数据结构","items":{"title":"单个result的数据","properties":{"rule":{"title":"当前地址所使用的规则","type":"object"},"result":{"title":"当前规则解析出来的数据","type":"object"}}}}
参数数据结构:
{"type":"object","description":"参数","required":["queueItem","pages"],"definitions": {"data": {"type":"object","title":"字段配置","properties":{"key":{"title":"字段名","type":"string"},"selector":{"type":"array","title":"jquery选择器代码","items":{"type":"string","title":"字段选择器"}},"removeSelector":{"type":"array","title":"需要删除的jquery选择器代码","items":{"type":"string","title":"字段选择器"}},"methodInfo":{"title":"调用的方法信息","type":"object"},"htmlStrategy":{"title":"html分析策略","type":"string"},"dealStrategy":{"title":"字段分析策略"},"match":{"title":"当使用match策略的时候,需要匹配的信息","type":"string"},"data":{"type":"array","title":"嵌套的字段","items":{"$ref":"#/definitions/data"}}}}},"properties":{"pages":{"type":"object","description":"地址过滤配置","required":["key","fields"],"properties":{"key":{"type":"string","title":"page的唯一字段"},"path":{"type":"string","title":"当前page对应的链接路径"},"enabled":{"type":"boolean","title":"是否激活状态"},"fields":{"type":"array","title":"字段配置","items":{"type":"object","title":"字段","properties":{"none":{"type":"object","title":"固定字段","properties":{"data": {"$ref":"#/definitions/data"}}}}}}}},"queueItem":{"type":"object","description":"下载的页面的链接信息","required":["url","_id"],"properties":{"_id":{"type":"string","title":"链接对应的md5"},"url":{"type":"string","title":"下载链接的详细地址"},"path":{"type":"string","title":"下载链接的路径"},"query":{"type":"string","title":"下载链接的地址栏参数信息"},"protocol":{"type":"string","title":"下载链接的协议"},"port":{"type":"number","title":"下载链接的端口"},"hostname":{"type":"string","title":"下载链接的域名"},"depth":{"type":"number","title":"下载链接深度"},"responseBody":{"type":"string","title":"html文档"}}}}}
测试数据
{"pages": [{"key": "main-123","path": "","areas": [],"fieldKey": "","fields": {"none": {"data": [{"key": "obj","selector": ["#ylNav2"],"dealStrategy": "object","data": [{"key": "title","selector": ["a.ylHdNavTt"],"methodInfo": { "text": [] },"dealStrategy": "normal"}, {"key": "title-1","selector": [".ylHdNavCon a"],"dealStrategy": "array","data": [{"key": "","selector": [],"methodInfo": { "text": [] },"dealStrategy": "normal"}]}]}, {"key": "cur-text","dealStrategy": "or","data": [{"selector": [".zsTobTabUl .cur a"],"methodInfo": { "text": [] },"dealStrategy": "normal"},{"selector": [".key_main .key_ul .hover"],"methodInfo": { "text": [] },"dealStrategy": "normal"}]}]}},"enabled": true}, {"key": "health-post","path": "/health/d+.shtml","areas": [],"fieldKey": "","fields": {"none": {"data": [{"key": "title","selector": ["#final_content .sfinal_w:eq(0) h1:eq(0)"],"removeSelector": [],"methodInfo": { "text": [] },"htmlStrategy": "jsdom","dealStrategy": "normal"}, {"key": "content","selector": ["#content_p"],"removeSelector": [],"methodInfo": { "html": [] },"htmlStrategy": "jsdom","dealStrategy": "normal"}]}},"enabled": true}, {"key": "main","path": "","areas": [],"fieldKey": "","fields": {"none": {"data": [{"key": "array","selector": ["#ylHnTime li"],"dealStrategy": "array","data": [{"key": "name","selector": ["a"],"methodInfo": { "text": [] },"dealStrategy": "normal"}]}, {"selector": ["#ylHnTime li"],"dealStrategy": "switch","data": [{"selector": "a","methodInfo": { "attr": ["title"] },"match": "0-1岁","data": [{"key": "switch","selector": ["a"],"formats": [{ "str": [] }],"methodInfo": { "text": [] }}],"dealStrategy": "case"}]}]}},"enabled": true}, {"key": "num","path": "","areas": [],"fieldKey": "","enabled": true,"fields": {"none": {"data": [{"key": "num","selector": ["#text_Keywords"],"dealStrategy": "normal","methodInfo": { "val": [] },"formats": [{ "key": "trim", "settings": { "start": true, "end": true, "middle": true } },{ "key": "regexp", "settings": {
"regexp": "/\\d+/", "scope": "i", "index": 0 } },{ "key": "num" }]}]}}}],"queueItem": {"_id": "djlflds3opidu3ur","responseBody": "摇篮网首页的html","url": "http://www.yaolan.com","path": "/"}}