#!/usr/bin/env python
# Extracted from git patch f365c39a36f222266a8140d702dcd4c6549a6431
# (new file htmlparse.py, mode 100755).
# From: Dylan Lloyd
# Date: Mon, 24 Jan 2011 21:24:06 -0500
# Subject: [PATCH] Rewriting the parsing code to use HTMLParser
#
# HTMLParser is a standard library and should be faster too, although I'd
# imagine the biggest bottleneck is fetching the files with urllib. To keep
# things simple, I just made a new file for now called htmlparse.py to work
# on it, I'll probably put them together later.
"""Fetch a Pandora profile's station table rows and print the link tokens.

Run as a script, this downloads the station rows for ``USER``, parses them
with :class:`parse`, and prints the collected ``station_tokens`` list.
Importing the module performs no network access.
"""
from contextlib import closing

try:
    from HTMLParser import HTMLParser  # Python 2 module name
except ImportError:
    from html.parser import HTMLParser  # Python 3 module name

try:
    from urllib import urlopen  # Python 2 location
except ImportError:
    from urllib.request import urlopen  # Python 3 location

USER = 'alphabethos'

# Endpoint returning the station table rows for a profile; the webname is
# appended directly to this prefix.
STATIONS_URL = ('http://www.pandora.com/favorites/'
                'profile_tablerows_station.vm?webname=')


class parse(HTMLParser):
    """Collect link tokens found inside ``station_table_row`` divs.

    The markup is fed to the parser from the constructor, so
    ``station_tokens`` is fully populated as soon as the instance exists.

    Attributes:
        station_tokens: list of strings, one per captured ``href``, each
            with its first ``_HREF_PREFIX_LEN`` characters removed.
    """

    # Number of leading characters stripped from every captured href.
    # NOTE(review): presumably the length of a fixed path prefix such as
    # "/stations/" -- confirm against the live markup.
    _HREF_PREFIX_LEN = 10

    def __init__(self, data, mode):
        """Parse ``data`` immediately.

        Args:
            data: HTML text to scan.
            mode: label echoed to stdout; it does not currently influence
                parsing in any way.
        """
        HTMLParser.__init__(self)
        # Single-argument print() behaves the same on Python 2 and 3.
        print(mode)
        self._in_row = False
        self.station_tokens = []
        self.feed(data)

    def handle_starttag(self, tag, attrs):
        """Track entry into a station row and capture hrefs of its links."""
        if tag == 'div':
            # Exact match on the whole class attribute value, as before.
            if ('class', 'station_table_row') in attrs:
                self._in_row = True
        elif tag == 'a' and self._in_row:
            for attr, value in attrs:
                # HTMLParser reports a valueless attribute (``<a href>``)
                # as None; slicing None would raise TypeError, so skip it.
                if attr == 'href' and value is not None:
                    self.station_tokens.append(
                        value[self._HREF_PREFIX_LEN:])

    def handle_data(self, text):
        """Ignore text content."""
        pass

    def handle_endtag(self, tag):
        """Leave row state on any closing tag.

        NOTE(review): every end tag (not only ``</div>``) clears the flag,
        so only links that appear before the first closing tag inside a row
        are captured. Kept as-is; confirm whether this is intended before
        narrowing it to ``div``.
        """
        self._in_row = False


def main():
    """Download the station rows for ``USER`` and print the parsed tokens."""
    # closing() guarantees the HTTP response is released on both Python 2
    # (where the response object is not a context manager) and Python 3.
    with closing(urlopen(STATIONS_URL + USER)) as response:
        page = response.read()
    # Python 3 returns bytes, which HTMLParser.feed() does not accept;
    # Python 2 already returns ``str`` and is passed through untouched.
    if not isinstance(page, str):
        page = page.decode('utf-8', 'replace')

    # Other endpoints kept for reference (not fetched yet):
    #page = urllib.urlopen('http://www.pandora.com/favorites/station_tablerows_thumb_up.vm?token=' + station + '&sort_col=thumbsUpDate')
    #page = urllib.urlopen(search_url)

    # NOTE(review): ``mode`` does not change parsing yet, so both passes
    # yield identical tokens and the first result is discarded. Both calls
    # are kept so the script's output is unchanged.
    p = parse(page, 'stations')
    p = parse(page, 'tracks')
    print(p.station_tokens)


if __name__ == '__main__':
    main()