# ancor_extracor.py
"""Print the href target of every <a> tag found in an HTML page.

Run as a script; an optional first command-line argument overrides the
built-in default URL.
"""
import sys
import urllib

try:
    from sgmllib import SGMLParser  # Python 2
except ImportError:
    # sgmllib was removed in Python 3. Emulate the single feature this
    # script relies on -- dispatching a start tag <tag ...> to a method
    # named start_<tag>(attrs) -- on top of html.parser.
    from html.parser import HTMLParser

    class SGMLParser(HTMLParser):
        """Minimal Python 3 stand-in for sgmllib.SGMLParser."""

        def handle_starttag(self, tag, attrs):
            handler = getattr(self, "start_" + tag, None)
            if handler is not None:
                handler(attrs)

try:
    urlopen = urllib.urlopen  # Python 2
except AttributeError:
    from urllib.request import urlopen  # Python 3

# NOTE(review): this default has no scheme or host. Python 2's
# urllib.urlopen treats such a string as a local file path, and Python 3's
# urlopen rejects it. Presumably the host part was lost -- confirm, or pass
# the full URL on the command line.
DEFAULT_URL = "/lehre/ss08/web-mining/uebungen.html"


class URLLister(SGMLParser):
    """Parser that collects the ``href`` values of all ``<a>`` tags.

    After ``feed()``/``close()``, the collected values are available in
    ``self.urls`` in document order.
    """

    def reset(self):
        """Reset the parser state and clear the list of collected URLs."""
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        """Record the ``href`` value(s) of one ``<a>`` start tag.

        ``attrs`` is a list of ``(name, value)`` pairs. A value of ``None``
        (how html.parser reports an attribute without a value) is skipped.
        """
        self.urls.extend(
            value for name, value in attrs
            if name == "href" and value is not None
        )


def main(argv=None):
    """Fetch a page and print each anchor target on its own line.

    ``argv`` is the list of command-line arguments without the program
    name; it defaults to ``sys.argv[1:]``. The first argument, if present,
    is the URL to fetch, otherwise ``DEFAULT_URL`` is used.
    """
    args = sys.argv[1:] if argv is None else argv
    url = args[0] if args else DEFAULT_URL

    page = urlopen(url)
    try:
        raw = page.read()
        # Only Python 3 response headers offer get_content_charset().
        headers = getattr(page, "headers", None)
        get_charset = getattr(headers, "get_content_charset", None)
        charset = get_charset() if get_charset is not None else None
    finally:
        # Always release the connection/file, even if reading fails.
        page.close()

    # Python 3 returns bytes but the parser needs text; Python 2 already
    # returns str, which is passed through unchanged.
    if not isinstance(raw, str):
        raw = raw.decode(charset or "utf-8", "replace")

    extractor = URLLister()
    extractor.feed(raw)
    extractor.close()
    for u in extractor.urls:
        print(u)


if __name__ == "__main__":
    main()