From: Dylan Lloyd Date: Wed, 26 Jan 2011 06:29:02 +0000 (-0500) Subject: Solved decode error, can't escape HTMLParseError X-Git-Url: https://disinclined.org/git/?a=commitdiff_plain;h=f333fc89b2013ead93c44e5c7d4ccc2f6b9fdd6e;p=i_like_pandora.git Solved decode error, can't escape HTMLParseError --- diff --git a/youtube.py b/youtube.py index 324b2c6..06ef7db 100755 --- a/youtube.py +++ b/youtube.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -from HTMLParser import HTMLParser +from HTMLParser import HTMLParser, HTMLParseError import urllib from doit import pandora_fetch @@ -21,9 +21,21 @@ class search_youtube(HTMLParser): for search in search_terms: self.__in_result = False search = urllib.quote_plus(search) - query = 'http://youtube.com/results?search_query=' - page = urllib.urlopen(query + search).read() - self.feed(page) + url = 'http://youtube.com/results?search_query=' + connection = urllib.urlopen(url + search) + encoding = connection.headers.getparam('charset') + page = connection.read() + page = page.decode(encoding) + try: + self.feed(page) + except UnicodeDecodeError: + print 'problem decoding', url + search + except HTMLParseError: + # There is no way to override HTMLParseError and + # continue parsing, see: + # http://bugs.python.org/issue755660 + # But the data is there! + print 'problem parsing', url + search def handle_starttag(self, tag, attrs): if tag == 'div': @@ -35,12 +47,8 @@ class search_youtube(HTMLParser): track_id = value if self.__in_result and len(track_id[19:]) == 11: self.track_ids.append(track_id[19:]) - print track_id[19:] self.__in_result = False - def handle_endtag(self, tag): - pass - - results = search_youtube(searches) +print results.track_ids