#!/usr/bin/env python
# htmlparse.py
#
# From: Dylan Lloyd
# Date: Tue, 25 Jan 2011 02:24:06 +0000 (-0500)
# Subject: Rewriting the parsing code to use HTMLParser
# Commit: f365c39a36f222266a8140d702dcd4c6549a6431 (i_like_pandora.git)
#
# HTMLParser is part of the standard library and should be faster too,
# although I'd imagine the biggest bottleneck is fetching the files with
# urllib. To keep things simple, I just made a new file for now, called
# htmlparse.py, to work on it; I'll probably put them together later.
"""Collect station tokens from a Pandora profile's station table rows."""

from contextlib import closing

try:  # Python 2 module layout
    from HTMLParser import HTMLParser
    from urllib import urlopen
except ImportError:  # Python 3 module layout
    from html.parser import HTMLParser
    from urllib.request import urlopen

USER = 'alphabethos'

# Endpoint returning the station table rows for a profile; USER is appended.
STATIONS_URL = ('http://www.pandora.com/favorites/'
                'profile_tablerows_station.vm?webname=')

# NOTE: not used yet. The per-station "thumbs up" rows would be fetched from
#   http://www.pandora.com/favorites/station_tablerows_thumb_up.vm
#       ?token=<station>&sort_col=thumbsUpDate
# and a search URL was a third candidate source in the original draft.


class parse(HTMLParser):
    """Scan an HTML fragment and collect tokens from station-row links.

    After construction, ``station_tokens`` holds one entry for every
    ``<a href=...>`` start tag seen while the parser is "in a row", i.e.
    after a ``<div class="station_table_row">`` start tag and before the
    next end tag of any kind.

    Args:
        data: HTML text to parse; it is fed to the parser immediately.
        mode: Label for this pass. It is only printed, not interpreted.
    """

    def __init__(self, data, mode):
        HTMLParser.__init__(self)
        print(mode)
        self.__in_row = False
        self.station_tokens = []
        self.feed(data)

    def handle_starttag(self, tag, attrs):
        """Enter a row on a station-row div; record hrefs of links in it."""
        if tag == 'div':
            for attr, value in attrs:
                if attr == 'class' and value == 'station_table_row':
                    self.__in_row = True
        elif tag == 'a' and self.__in_row:
            for attr, value in attrs:
                # A bare ``href`` attribute has no value (None); slicing it
                # would raise TypeError, so such links are skipped.
                if attr == 'href' and value is not None:
                    # Drop the first 10 characters of the link target --
                    # presumably a '/stations/' prefix; verify against the
                    # live markup.
                    self.station_tokens.append(value[10:])

    def handle_endtag(self, tag):
        """Leave the current row on any end tag, whatever element closes."""
        self.__in_row = False


def fetch(url):
    """Return the body of *url* as text, closing the connection afterwards.

    On Python 3 the response is bytes and is decoded as UTF-8 (undecodable
    bytes are replaced); on Python 2 the byte string is returned unchanged.
    """
    with closing(urlopen(url)) as response:
        body = response.read()
    if not isinstance(body, str):
        body = body.decode('utf-8', 'replace')
    return body


def main():
    """Fetch USER's station rows and print the collected station tokens."""
    page = fetch(STATIONS_URL + USER)
    # Both passes currently parse the same page; 'tracks' is a placeholder
    # until the per-station page is fetched (see NOTE near STATIONS_URL).
    p = parse(page, 'stations')
    p = parse(page, 'tracks')
    print(p.station_tokens)


if __name__ == '__main__':
    main()