# ancor_extracor.py
import urllib
from sgmllib import SGMLParser
class URLLister(SGMLParser):
    """Parser that gathers the ``href`` attribute values passed to ``start_a``.

    The gathered values are kept, in the order they were received, in the
    ``urls`` list.
    """

    def reset(self):
        """Reset the base parser state and begin with an empty ``urls`` list."""
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        """Append the value of every ``href`` entry in *attrs* to ``urls``.

        *attrs* is iterated as ``(name, value)`` pairs; entries whose name is
        not ``'href'`` are ignored.
        """
        # NOTE(review): presumably invoked by SGMLParser for each <a> start
        # tag (sgmllib handler naming convention) -- confirm against the docs.
        for name, value in attrs:
            if name == 'href':
                self.urls.append(value)
            
            
if __name__ == "__main__":
    # Fetch one page, run it through URLLister and print every collected href.
    # NOTE(review): the address below has no scheme or host, so urllib.urlopen
    # will presumably treat it as a local file path -- confirm the intended
    # absolute URL.
    page = urllib.urlopen("/lehre/ss08/web-mining/uebungen.html")
    try:
        html = page.read()
    finally:
        # Release the handle even if reading fails.
        page.close()

    extractor = URLLister()
    extractor.feed(html)
    # close() is called before the results are read, as in the original flow.
    extractor.close()

    # One collected href per line, in the order they were gathered.
    for u in extractor.urls:
        # Parenthesised single argument prints the same in Python 2.
        print(u)
            
         
                         
  
 
            
# Web page footer residue (not part of the script):
# phone +49 6151 16-21811; navigation links: Drucken (print),
# Impressum (imprint), Sitemap, Suche (search)
