# ancor_extracor.py

import urllib
from sgmllib import SGMLParser


class URLLister(SGMLParser):
    def reset(self):                              
        SGMLParser.reset(self)
        self.urls = []
        
    def start_a(self, attrs):                     
        
        href = [v for k, v in attrs if k=='href'] 
        if href:
            self.urls.extend(href)
            
            
if __name__ == "__main__":
    # Fetch one page, collect the href of every anchor on it and print the
    # collected values, one per line.
    #
    # NOTE(review): the address below has no scheme or host. Python 2's
    # urllib.urlopen treats such a string as a local file path -- confirm
    # whether an absolute http:// URL was intended here.
    page = urllib.urlopen("/lehre/ss08/web-mining/uebungen.html")
    try:
        markup = page.read()
    finally:
        # Always release the underlying handle, even if reading fails.
        page.close()

    extractor = URLLister()
    extractor.feed(markup)
    extractor.close()  # force processing of any data still buffered

    for u in extractor.urls:
        # Parenthesised single-argument form prints the same in Python 2.
        print(u)
Kontakt

small ke-icon

Knowledge Engineering Group

Fachbereich Informatik
TU Darmstadt

S2|02 D203
Hochschulstrasse 10

D-64289 Darmstadt

Sekretariat:
Telefon: +49 6151 16-21811
Fax: +49 6151 16-21812
E-Mail: info@ke.tu-darmstadt.de

 
A A A | Drucken | Impressum | Sitemap | Suche | Mobile Version
zum Seitenanfang