"""Fetch a password-protected wiki page and tokenize its HTML.

``Wikipage`` downloads a page and carries a ``score`` used for ordering;
``WikipageProcessor`` collects the ``href`` targets and whitespace-separated
words found in the markup.
"""

import re
import string

try:  # Python 2
    import urllib2
    from sgmllib import SGMLParser
except ImportError:  # Python 3: urllib2 and sgmllib no longer exist
    import urllib.request as urllib2
    from html.parser import HTMLParser

    class SGMLParser(HTMLParser):
        """Small stand-in for ``sgmllib.SGMLParser`` built on ``HTMLParser``.

        Only the ``start_<tag>(attrs)`` dispatch used in this module is
        emulated.
        """

        def handle_starttag(self, tag, attrs):
            handler = getattr(self, 'start_' + tag, None)
            if handler is not None:
                handler(attrs)


stopwords = [
    'I', 'a', 'an', 'as', 'at', 'by', 'he', 'his', 'me', 'or', 'thou', 'us',
    'who', 'against', 'amid', 'amidst', 'among', 'amongst', 'and', 'anybody',
    'anyone', 'because', 'beside', 'circa', 'despite', 'during', 'everybody',
    'everyone', 'for', 'from', 'her', 'hers', 'herself', 'him', 'himself',
    'hisself', 'idem', 'if', 'into', 'it', 'its', 'itself', 'myself', 'nor',
    'of', 'oneself', 'onto', 'our', 'ourself', 'ourselves', 'per', 'she',
    'since', 'than', 'that', 'the', 'thee', 'theirs', 'them', 'themselves',
    'they', 'thine', 'this', 'thyself', 'to', 'tother', 'toward', 'towards',
    'unless', 'until', 'upon', 'versus', 'via', 'we', 'what', 'whatall',
    'whereas', 'which', 'whichever', 'whichsoever', 'whoever', 'whom',
    'whomever', 'whomso', 'whomsoever', 'whose', 'whosoever', 'with',
    'without', 'ye', 'you', 'you-all', 'yours', 'yourself', 'yourselves',
    'aboard', 'about', 'above', 'across', 'after', 'all', 'along',
    'alongside', 'although', 'another', 'anti', 'any', 'anything', 'around',
    'astride', 'aught', 'bar', 'barring', 'before', 'behind', 'below',
    'beneath', 'besides', 'between', 'beyond', 'both', 'but', 'concerning',
    'considering', 'down', 'each', 'either', 'enough', 'except', 'excepting',
    'excluding', 'few', 'fewer', 'following', 'ilk', 'in', 'including',
    'inside', 'like', 'many', 'mine', 'minus', 'more', 'most', 'naught',
    'near', 'neither', 'nobody', 'none', 'nothing', 'notwithstanding', 'off',
    'on', 'opposite', 'other', 'otherwise', 'outside', 'over', 'own', 'past',
    'pending', 'plus', 'regarding', 'round', 'save', 'self', 'several', 'so',
    'some', 'somebody', 'someone', 'something', 'somewhat', 'such',
    'suchlike', 'sundry', 'there', 'though', 'through', 'throughout', 'till',
    'twain', 'under', 'underneath', 'unlike', 'up', 'various', 'vis-a-vis',
    'whatever', 'whatsoever', 'when', 'wherewith', 'wherewithal', 'while',
    'within', 'worth', 'yet', 'yon', 'yonder',
]


class Wikipage:
    """A wiki page downloaded over HTTP basic auth, ordered by ``score``.

    Attributes set by the constructor:
        url: the address the page was fetched from.
        score: numeric rank, initialised to 0.
        content: the raw response body exactly as returned by ``read()``.
        outwardLinks: currently always an empty list (see the TODO in
            ``__init__``).
    """

    # Equality is score-based, so instances are kept unhashable.
    __hash__ = None

    def __init__(self, url):
        """Download *url*, run the markup through ``WikipageProcessor``.

        Side effect: installs a process-wide urllib opener carrying the
        basic-auth credentials below.  Errors raised by ``urlopen`` are not
        caught here.
        """
        self.url = url
        self.score = 0

        # NOTE(review): credentials are hard-coded in source; consider moving
        # them to configuration.
        pm = urllib2.HTTPPasswordMgrWithDefaultRealm()
        pm.add_password(None, 'https://scorpio.cs.usfca.edu',
                        'cs662', 'iloveAI')
        auth_handler = urllib2.HTTPBasicAuthHandler(pm)
        opener = urllib2.build_opener(auth_handler)
        # ...and install it globally so it can be used with urlopen.
        urllib2.install_opener(opener)

        response = urllib2.urlopen(url)
        try:
            self.content = response.read()
        finally:
            # Release the connection even when read() fails.
            response.close()

        # The Python 3 parser only accepts text, while read() yields bytes
        # there.  Assumes UTF-8 pages -- TODO confirm against the server.
        markup = self.content
        if not isinstance(markup, str):
            markup = markup.decode('utf-8', 'replace')

        # Now parse the page and get the content.
        wp = WikipageProcessor()
        wp.feed(markup)
        wp.close()  # flush any text the parser is still buffering

        # TODO: strip out stopwords, remove non-words, and extract links
        # (but not edit links or external links) from wp.words and
        # wp.outwardLinks.  Until that exists the parse results are unused.
        self.outwardLinks = []

    # NOTE(review): the ordering below is inverted (a higher score compares
    # as "less"), presumably so that sorting or a min-heap yields the best
    # page first -- confirm with callers before changing it.

    def __le__(self, other):
        return self.score >= other.score

    def __lt__(self, other):
        return self.score > other.score

    def __eq__(self, other):
        """Two pages are equal when their scores are equal."""
        return self.score == other.score

    def __ne__(self, other):
        return self.score != other.score

    def __cmp__(self, other):
        """Three-way comparison (Python 2), consistent with ``__lt__``.

        Returns a negative number when this page orders first (has the
        higher score), zero on equal scores and a positive number otherwise.
        """
        return (other.score > self.score) - (other.score < self.score)


class WikipageProcessor(SGMLParser):
    """Collect link targets and words while parsing an HTML page.

    After ``feed()``: ``outwardLinks`` holds every ``href`` value seen on an
    ``<a>`` tag, in document order, and ``words`` holds the
    whitespace-separated tokens of all text passed to ``handle_data``.
    """

    def reset(self):
        """Reset parser state and start with empty result lists."""
        SGMLParser.reset(self)
        self.outwardLinks = []
        self.words = []

    def start_a(self, attrs):
        """Record the ``href`` value(s) of an opening ``<a>`` tag."""
        # A valueless attribute can arrive as None; skip it.
        href = [v for k, v in attrs if k == 'href' and v is not None]
        if href:
            self.outwardLinks.extend(href)

    def handle_data(self, text):
        """Split a run of text on whitespace and append the pieces."""
        self.words.extend(text.split())