#!/usr/bin/python
"""Fetch random web pages and store each one in a local file.

Usage: corpusFetcher <nurls>

For each of the <nurls> iterations the script asks a "random page" service
for a page, pulls the first http:// URL out of the response, downloads that
URL and writes the body to a file in the current directory.

The code is written to run unchanged on Python 2 and Python 3.
"""

import re
import sys
import time
import urllib
from contextlib import closing

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    urlopen = urllib.urlopen  # Python 2

# Service queried by getRandomPage() for each new random page.
RANDOM_PAGE_URL = 'http://random.yahoo.com/fast/ryl'

# A URL runs until whitespace, a quote or an angle bracket, so that markup
# surrounding the link is not swallowed into the match.
_URL_PATTERN = re.compile(r'http://[^\s"\'<>]+')

# Content of the most recent response returned by getRandomPage(); used to
# detect the service handing back the same page twice in a row.
lastURL = ''


def _fetch(url):
    """Return the raw body of `url`, closing the connection afterwards."""
    with closing(urlopen(url)) as response:
        return response.read()


def _fetch_text(url):
    """Return the body of `url` as a native `str` on Python 2 and 3."""
    data = _fetch(url)
    if not isinstance(data, str):
        # Python 3 hands back bytes; the regex in extractURL() needs text.
        data = data.decode('utf-8', 'replace')
    return data


def getRandomPage():
    """Return the random-page service response, avoiding immediate repeats.

    The response is compared with the previous one (kept in the module-level
    `lastURL`); while they are equal the function waits one second and asks
    again. The accepted response is remembered in `lastURL` and returned.

    Network errors raised by the underlying urlopen call propagate to the
    caller.
    """
    global lastURL
    latest = _fetch_text(RANDOM_PAGE_URL)
    while latest == lastURL:
        # Pause *before* retrying so we never sleep once a fresh page is in.
        time.sleep(1)
        latest = _fetch_text(RANDOM_PAGE_URL)
    lastURL = latest
    return latest


def extractURL(inpage):
    """Print `inpage` and return the first http:// URL found in it.

    Raises:
        ValueError: if `inpage` contains no http:// URL.
    """
    print(inpage)
    match = _URL_PATTERN.search(inpage)
    if match is None:
        raise ValueError('no http:// URL found in page')
    return match.group()


def writeOut(url):
    """Download `url` and store its body in a file in the current directory.

    The file name is the URL with the 'http://' prefix removed and every '/'
    replaced by '-'. This is a best-effort operation: any failure is reported
    on stdout and swallowed so that a single bad URL does not abort a run.

    Returns:
        The name of the file written, or None if fetching or writing failed.
    """
    try:
        fname = url.replace('http://', '').replace('/', '-')
        # Fetch first so a failed download does not leave an empty file.
        body = _fetch(url)
        # Binary mode stores the bytes exactly as they were received.
        with open(fname, 'wb') as out:
            out.write(body)
    except Exception as err:  # deliberate best-effort boundary
        print('error writing %s: %s' % (url, err))
        return None
    return fname


def main(argv=None):
    """Run the fetcher; `argv` defaults to `sys.argv`.

    Returns the process exit status: 0 on completion, 2 on a usage error.
    """
    if argv is None:
        argv = sys.argv
    try:
        nurls = int(argv[1])
    except (IndexError, ValueError):
        sys.stderr.write('usage: corpusFetcher <nurls>\n')
        return 2

    for _ in range(nurls):
        try:
            url = extractURL(getRandomPage())
        except ValueError as err:
            # A response without a link is skipped rather than fatal.
            print('skipping page: %s' % err)
            continue
        writeOut(url)
    return 0


if __name__ == '__main__':
    sys.exit(main())