robots.txt parser for node.js

  • robotstxt is written in CoffeeScript
  • robotstxt is currently in alpha
  • robotstxt offers an easy way to obey the allow/disallow rules listed in a site's robots.txt


npm install robotstxt

all examples use CoffeeScript syntax


robotsTxt = require 'robotstxt'

parse a robots.txt:

#robotsTxt(url, user_agent)
google_robots_txt = robotsTxt '', 'Mozilla/5.0 (compatible; Googlebot/2.1; +'

assign an event handler that runs once all the parsing is done

google_robots_txt.on 'ready', (gate_keeper) -> 
  #returns false
  #note: only the path and the query of the url get parsed; everything else (i.e. http://, the domain name) gets discarded
  console.log gate_keeper.isAllowed ''
  #returns false
  console.log gate_keeper.isAllowed '/setnewsprefs?sfsdfg'
  #returns true
  #note: only the path and the query of the url get parsed; everything else (i.e. http://, the domain name) gets discarded
  console.log gate_keeper.isDisallowed '' 
  #returns true
  console.log gate_keeper.isDisallowed '/setnewsprefs?sfsdfg'

gate_keeper methods:

#asks the gate_keeper if it's ok to crawl a url
isAllowed url
#asks the gate_keeper if it's not ok to crawl a url
isDisallowed url
#answers the question of why a url is allowed/disallowed
why url
#if you want to change the user agent that is used for this question
setUserAgent user_agent
#if you want to know which robots.txt group is used with which user_agent
#by default uses the user agent set with setUserAgent
getGroup (user_agent)

robotsTxt methods

#fetches and parses url with user_agent
#returns a robots_txt event emitter
robotsTxt(url, user_agent)

#blank robots_txt object
blank_robots_txt = robotsTxt()

#crawls and parses a robots.txt
#emits a 'crawled' event
blank_robots_txt.crawl: (protocol, host, port, path,  user_agent, encoding)

#parses a txt string line by line
#emits a 'ready' event

robotsTxt events

#emitted after the whole robots.txt is crawled
robotsTxt.on 'crawled', (txt) -> ...

#emitted after all lines of the robots.txt are parsed
robotsTxt.on 'ready', (gate_keeper) -> ...

#emitted if something did not quite work
#note: it's a good idea to always implement this error listener
#to prevent strange-looking error messages in case there are internet connection issues
robotsTxt.on 'error', (error) -> ...


the default user-agent used is

#robotsTxt(url, user_agent)
Mozilla/5.0 (compatible; Open-Source-Coffee-Script-Robots-Txt-Checker/2.1; +

I strongly recommend using your own user agent


myapp_robots_txt = robotsTxt '', 'Mozilla/5.0 (compatible; MyAppBot/2.1; +'

if you want to simulate another crawler (for testing purposes only, of course), see this list for the correct user agent strings

  • [List of User Agent Strings] (
  • [Googlebot] (


  • ready event should also pass a sitemaps_in_robots_txt object
  • sitemaps_in_robots_txt should offer methods to collect the urls listed in the sitemap