爬虫链接模块
用于发现html文档中的地址链接
当前支持的模式
- 分析页面
pattern: role:crawler.plugin.queue,cmd:analyze 测试地址:http://172.16.112.215:9003/act POST
返回数据结构:
{"type":"array","description":"返回的数据结构","items":{"type":"object","properties":{"url":{"type":"string","title":"下载链接的详细地址"},"path":{"type":"string","title":"下载链接的路径"},"query":{"type":"string","title":"下载链接的地址栏参数信息"},"protocol":{"type":"string","title":"下载链接的协议"},"port":{"type":"number","title":"下载链接的端口"},"hostname":{"type":"string","title":"下载链接的域名"},"depth":{"type":"number","title":"下载链接深度"}}}}
参数数据结构:
{"type":"object","description":"参数","required":["queueItem"],"properties":{"discoverConfig":{"type":"object","description":"地址过滤配置","properties":{"parseHTMLComments":{"type":"boolean","title":"是否查找注释中的地址"},"parseScriptTags":{"type":"boolean","title":"是否查找script标签中的地址"},"ignoreRobots":{"type":"boolean","title":"是否忽略机器人应答"},"maxDepth":{"type":"number","title":"最大深度"},"fetchWhitelistedMimeTypesBelowMaxDepth":{"type":"boolean","title":"是否开启最大深度过滤"},"whitePathList":{"type":"array","title":"路径白名单","items":{"type":"object","title":"单个路径规则","properties":{"path":{"type":"string","title":"规则,支持正则"},"enable":{"type":"boolean","title":"是否开启"}}}},"allowedProtocols":{"type":"array","title":"支持的协议","items":{"type":"string"}}}},"queueConfig":{"type":"object","description":"域名过滤配置","properties":{"ignoreWWWDomain":{"type":"boolean","title":"忽略www的域名"},"stripWWWDomain":{"type":"boolean","title":"是否去掉www域名,使用根域名"},"scanSubdomains":{"type":"boolean","title":"搜索子域名"},"host":{"type":"string","title":"初始host"},"initialProtocol":{"type":"string","title":"初始协议"},"initialPort":{"type":"number","title":"初始端口"},"stripQuerystring":{"type":"boolean","title":"去掉地址栏后面的query参数"},"allowQueryParams":{"type":"array","title":"过滤query参数,选择需要的","items":{"type":"string","title":"单个参数"}},"domainWhiteList":{"type":"array","title":"域名白名单,支持正则","items":{"type":"string","title":"单个域名规则"}},"filterByDomain":{"type":"boolean","title":"是否开启过滤域名白名单"}}},"queueItem":{"type":"object","description":"下载的页面的链接信息","required":["url","responseBody"],"properties":{"url":{"type":"string","title":"下载链接的详细地址"},"path":{"type":"string","title":"下载链接的路径"},"query":{"type":"string","title":"下载链接的地址栏参数信息"},"protocol":{"type":"string","title":"下载链接的协议"},"port":{"type":"number","title":"下载链接的端口"},"hostname":{"type":"string","title":"下载链接的域名"},"depth":{"type":"number","title":"下载链接深度"},"responseBody":{"type":"string","title":"html文档"}}}}}
测试数据
{"queueConfig": {"ignoreWWWDomain": false,"stripWWWDomain": false,"scanSubdomains": true,"host": "www.yaolan.com","initialProtocol": "http","initialPort": 80,"stripQuerystring": true,"fetchConditions": [],"domainWhiteList": ["(.*?).yaolan.com"],"filterByDomain": true},"discoverConfig": {"parseHTMLComments": false,"parseScriptTags": false,"allowedProtocols": ["http", "https"],"whitePathList": [{ "path": "/(.*?)", "enable": true }],"userAgent": "","fetchWhitelistedMimeTypesBelowMaxDepth": false,"maxDepth": 0,"ignoreRobots": true},"queueItem": {"responseBody": "摇篮网首页的html代码","url": "http://www.yaolan.com"}}
- 链接规范化
pattern: role:crawler.plugin.queue,cmd:queue
返回数据结构:
{"type":"array","description":"返回的数据结构","items":{"type":"object","properties":{"url":{"type":"string","title":"下载链接的详细地址"},"path":{"type":"string","title":"下载链接的路径"},"query":{"type":"string","title":"下载链接的地址栏参数信息"},"protocol":{"type":"string","title":"下载链接的协议"},"port":{"type":"number","title":"下载链接的端口"},"hostname":{"type":"string","title":"下载链接的域名"},"depth":{"type":"number","title":"下载链接深度"}}}}
参数数据结构:
{"type":"object","description":"参数","required":["urls"],"properties":{"queueConfig":{"type":"object","description":"域名过滤配置","properties":{"ignoreWWWDomain":{"type":"boolean","title":"忽略www的域名"},"stripWWWDomain":{"type":"boolean","title":"是否去掉www域名,使用根域名"},"scanSubdomains":{"type":"boolean","title":"搜索子域名"},"host":{"type":"string","title":"初始host"},"initialProtocol":{"type":"string","title":"初始协议"},"initialPort":{"type":"number","title":"初始端口"},"stripQuerystring":{"type":"boolean","title":"去掉地址栏后面的query参数"},"allowQueryParams":{"type":"array","title":"过滤query参数,选择需要的","items":{"type":"string","title":"单个参数"}},"domainWhiteList":{"type":"array","title":"域名白名单,支持正则","items":{"type":"string","title":"单个域名规则"}},"filterByDomain":{"type":"boolean","title":"是否开启过滤域名白名单"}}},"urls":{"type":"array","description":"下载的页面的链接信息","items":{"type":"string","title":"链接"}}}}
测试数据
{"queueConfig": {"ignoreWWWDomain": false,"stripWWWDomain": false,"scanSubdomains": true,"host": "www.yaolan.com","initialProtocol": "http","initialPort": 80,"stripQuerystring": true,"fetchConditions": [],"domainWhiteList": ["(.*?).yaolan.com"],"filterByDomain": true},"urls": ["http://www.yaolan.com","http://bbs.yaolan.com"]}