import time
import urllib2
from heapq import heappush, heappop

import wikipage

# Root of the wiki being crawled; links extracted from a page are appended to it.
base = 'https://scorpio.cs.usfca.edu/'

# The wiki's Special:Random address: used to collect random pages and as the
# default starting point of a crawl.
RANDOM_PAGE_URL = base + 'wiki/index.php/Special:Random'


class crawler:
    """Collect wikipage.Wikipage objects, at random or through a scored crawl.

    Attributes:
        pages: list of Wikipage objects gathered so far; both
            getRandomPages() and crawl() append to it.
        queue: heap (maintained with heapq) of Wikipage objects that crawl()
            has created but not yet examined. It is not cleared between
            calls, so entries left over from an earlier crawl() are
            examined by the next one.
    """

    def __init__(self):
        self.pages = []
        self.queue = []

    def getRandomPages(self, npages):
        """Append `npages` Wikipage objects built from RANDOM_PAGE_URL.

        Prints how many pages were requested and the elapsed time in whole
        seconds. Returns None; the pages are stored in self.pages.
        """
        start = time.time()
        for _ in range(npages):
            self.pages.append(wikipage.Wikipage(RANDOM_PAGE_URL))
        ### add document frequency calculations here.
        end = time.time()
        # Single-argument print(...) produces the same output on Python 2.
        print("%d pages collected in %d seconds" % (npages, end - start))

    def crawl(self, nGoalpages=10, ntotalPages=500, threshold=0.5,
              scorer=None, fractionLinksUsed=0.25, startURL=RANDOM_PAGE_URL):
        """Crawl outward from `startURL`, keeping pages that score well.

        Args:
            nGoalpages: stop once this many pages scoring above `threshold`
                have been appended to self.pages during this call.
            ntotalPages: maximum number of Wikipage objects to create during
                this call. The start page is always created and counts as
                one. Pages already created when the budget runs out are
                still examined, but no further links are followed.
            threshold: a page is kept when its score is strictly greater
                than this value.
            scorer: object whose score(page) method returns the page's
                score. Required.
            fractionLinksUsed: fraction of each page's outwardLinks
                (taken from the front of the list) that is followed.
            startURL: URL of the first page.

        Returns:
            self.pages (including anything it held before the call), or
            None when no scorer was supplied.
        """
        if scorer is None:
            print('Please provide a scorer.')
            return None

        # Closed list: every URL already requested during this crawl, so the
        # same address is never fetched or queued twice.
        visited = set([startURL])
        pagesFound = 0

        startpage = wikipage.Wikipage(startURL)
        # Store the score on the page, exactly as is done for linked pages;
        # the threshold test below reads page.score.
        startpage.score = scorer.score(startpage)
        pagesCrawled = 1  # counted once per page, at creation time
        heappush(self.queue, startpage)

        # NOTE(review): heapq pops the *smallest* item according to the
        # Wikipage comparison operators, which are not visible here --
        # confirm they order pages so the best-scoring one is popped first.
        while self.queue and pagesFound < nGoalpages:
            nextpage = heappop(self.queue)
            # '%s %s' reproduces the Python 2 output of
            # "print 'Fetching ', url" (items separated by one space).
            print('%s %s' % ('Fetching ', nextpage.url))
            if nextpage.score > threshold:
                self.pages.append(nextpage)
                pagesFound += 1
                if pagesFound >= nGoalpages:
                    # Goal reached: do not spend fetches on this page's links.
                    break

            links = nextpage.outwardLinks
            nlinks = int(fractionLinksUsed * len(links))
            for link in links[:nlinks]:
                if pagesCrawled >= ntotalPages:
                    # Budget spent; pages still queued are examined, but no
                    # new page is created.
                    break
                url = base + link
                if url in visited:
                    continue
                visited.add(url)
                print('%s %s' % ('Extracted ', link))
                newpage = wikipage.Wikipage(url)
                newpage.score = scorer.score(newpage)
                pagesCrawled += 1
                heappush(self.queue, newpage)

        return self.pages