From: Dylan Lloyd Date: Mon, 21 Feb 2011 06:41:42 +0000 (-0500) Subject: BeautifulSoup is no longer a dependency! X-Git-Url: https://disinclined.org/git/?a=commitdiff_plain;h=9fc5c8cd64082ff27048b0ab0f1e04de77c009d0;p=i_like_pandora.git BeautifulSoup is no longer a dependency! The parsing code is now all wrapped up in search_classes.py. The two classes are inherited from HTMLParser and their objects are constructed with pandora_fetch(user) and search_youtube(search_strings). YouTube has changed the way it formats its results pages, and the code now reflects the new layout. BeautifulSoup is no longer a dependency! HTMLParser is significantly faster than BeautifulSoup. While the true bottleneck is video download time, the change is still noticeable. --- diff --git a/README b/README index 29d12db..c10f736 100644 --- a/README +++ b/README @@ -15,10 +15,6 @@ Requires youtube-dl to be installed. follow instruction here: http://rg3.github.com/youtube-dl/ Or use your favorite package-management system to install youtube-dl. Make sure youtube-dl is up to date if there are problems by running 'youtube-dl -U'. -Also requires the BeautifulSoup module for Python, which can be found here: -http://www.crummy.com/software/BeautifulSoup/download/3.x/BeautifulSoup-3.2.0.tar.gz -To install, after extracting the archive, `cd` into the directory and `python setup.py install` to install the module. - Notifications require Nofify OSD (https://launchpad.net/notify-osd), which ships with Ubuntu. See http://www.github.com/nospampleasemam/youtube_backup for more information and the latest version. 
diff --git a/doit.py b/doit.py deleted file mode 100755 index 616d2c9..0000000 --- a/doit.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python -from HTMLParser import HTMLParser -import urllib - -class pandora_fetch(HTMLParser): - - def __init__(self, user): - HTMLParser.__init__(self) - self.user = user - self.stations = [] - self.tracks = {} - self.__in_row = False - self.__in_track = False - self.__current_track = None - self.__mode = 'stations' - page = urllib.urlopen('http://www.pandora.com/favorites/profile_tablerows_station.vm?webname=' + self.user).read() - self.feed(page) - self.__mode = 'tracks' - for station in self.stations: - page = urllib.urlopen('http://www.pandora.com/favorites/station_tablerows_thumb_up.vm?token=' + station + '&sort_col=thumbsUpDate').read() - self.feed(page) - - def handle_starttag(self, tag, attrs): - if self.__mode == 'stations': - if tag == 'div': - for attr, value in attrs: - if attr == 'class' and value == 'station_table_row': - self.__in_row = True - continue - if self.__in_row and tag == 'a': - for attr, value in attrs: - if self.__in_row and attr == 'href': - self.stations.append(value[10:]) - continue - if self.__mode == 'tracks': - if tag == 'span': - for attr, value in attrs: - if attr == 'class' and value == 'track_title': - self.__in_track = True - continue - if attr == 'tracktitle': - self.__current_track = value - - def handle_data(self, text): - if self.__in_track: - self.tracks[self.__current_track] = text - - def handle_endtag(self, tag): - if tag == 'div': - self.__in_row = False - if tag == 'a': - self.__in_track = False - self.__current_track = None diff --git a/htmlparse.py b/htmlparse.py deleted file mode 100755 index 3447cd8..0000000 --- a/htmlparse.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python -from HTMLParser import HTMLParser -import urllib - -USER = 'alphabethos' - -class parse(HTMLParser): - - def __init__(self, data, mode): - HTMLParser.__init__(self) - print mode - self.__in_row = 
False - self.station_tokens = [] - self.feed(data) - - def handle_starttag(self, tag, attrs): - if tag == 'div': - for attr, value in attrs: - if attr == 'class' and value == 'station_table_row': - self.__in_row = True - continue - if tag == 'a': - for attr, value in attrs: - if attr == 'href' and self.__in_row: - self.station_tokens.append(value[10:]) - continue - - def handle_data(self, text): - pass - - def handle_endtag(self, tag): - self.__in_row = False - pass - - -page = urllib.urlopen('http://www.pandora.com/favorites/profile_tablerows_station.vm?webname=' + USER).read() -#page = urllib.urlopen('http://www.pandora.com/favorites/station_tablerows_thumb_up.vm?token=' + station + '&sort_col=thumbsUpDate') -#page = urllib.urlopen(search_url) -p = parse(page, 'stations') -p = parse(page, 'tracks') -print p.station_tokens diff --git a/likes_pandora.py b/likes_pandora.py index 92d6e07..3236496 100755 --- a/likes_pandora.py +++ b/likes_pandora.py @@ -36,7 +36,7 @@ except: print 'There is a formatting error in the configuration file at', CONFIG_FILE sys.exit() -from BeautifulSoup import BeautifulSoup +from search_classes import pandora_fetch, search_youtube, found_video import urllib import urllib2 import re @@ -49,19 +49,6 @@ if NOTIFICATIONS: import tempfile import string -def fetch_stations(user): - """ This takes a pandora username and returns the a list of the station tokens that the user is subscribed to. """ - stations = [] - page = urllib.urlopen('http://www.pandora.com/favorites/profile_tablerows_station.vm?webname=' + USER) - page = BeautifulSoup(page) - table = page.findAll('div', attrs={'class':'station_table_row'}) - for row in table: - if row.find('a'): - for attr, value in row.find('a').attrs: - if attr == 'href': - stations.append(value[10:]) - return stations - def fetch_tracks(stations): """ Takes a list of station tokens and returns a list of Title + Artist strings. 
""" @@ -90,26 +77,6 @@ def fetch_tracks(stations): pass return search_strings -def search_youtube(search_strings): - """ This takes a list of search strings and tries to find the first result. It returns a list of the youtube video ids of those results. - """ - video_list = [] - for search_string in search_strings: - search_url = 'http://youtube.com/results?search_query=' + urllib.quote_plus(search_string) - page = urllib.urlopen(search_url) - page = BeautifulSoup(page) - result = page.find('div', attrs={'class':'video-main-content'}) - if result == None: - print 'odd feedback for search, could not find div at ', search_url - continue - for attr, value in result.attrs: - if attr == 'id' and len(value[19:]) == 11: - video_list.append(value[19:]) - elif attr == 'id': - print 'odd feedback for search', search_url, " : ", value[19:] - return video_list - - def check_for_existing(video_list): """ Checks the download-folder for existing videos with same id and removes from video_list. """ filelist = os.listdir(DIR) @@ -166,12 +133,13 @@ def fetch_videos(video_list): note.show() def main(): - stations = fetch_stations(USER) - if len(stations) == 0: - print 'Are you sure your pandora profile is public? Can\'t seem to find any stations listed with your account.' 
- search_strings = fetch_tracks(stations) - videos = search_youtube(search_strings) - videos = check_for_existing(videos) + stations = pandora_fetch(USER) + searches = [] + for title, artist in stations.tracks.iteritems(): + search = title + " " + artist + searches.append(search) + videos = search_youtube(searches); + videos = check_for_existing(videos.track_ids) fetch_videos(videos) if __name__ == "__main__": diff --git a/search_classes.py b/search_classes.py new file mode 100755 index 0000000..9329018 --- /dev/null +++ b/search_classes.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python + +from HTMLParser import HTMLParser, HTMLParseError +import urllib +from fetch_pandora import pandora_fetch + +class pandora_fetch(HTMLParser): + """ This class should be initiated with a Pandora account username. It exposes a list of tracks `self.tracks` and a dictionary of title->artist pairs `tracks`. + """ + + def __init__(self, user): + HTMLParser.__init__(self) + self.user = user + self.stations = [] + self.tracks = {} + self.__in_row = False + self.__in_track = False + self.__current_track = None + self.__mode = 'stations' + page = urllib.urlopen('http://www.pandora.com/favorites/profile_tablerows_station.vm?webname=' + self.user).read() + self.feed(page) + self.__mode = 'tracks' + if len(self.stations) == 0: + print 'Are you sure your pandora profile is public? Can\'t seem to find any stations listed with your account.' 
+ return 1 + for station in self.stations: + page = urllib.urlopen('http://www.pandora.com/favorites/station_tablerows_thumb_up.vm?token=' + station + '&sort_col=thumbsUpDate').read() + self.feed(page) + + def handle_starttag(self, tag, attrs): + if self.__mode == 'stations': + if tag == 'div': + for attr, value in attrs: + if attr == 'class' and value == 'station_table_row': + self.__in_row = True + continue + if self.__in_row and tag == 'a': + for attr, value in attrs: + if self.__in_row and attr == 'href': + self.stations.append(value[10:]) + continue + if self.__mode == 'tracks': + if tag == 'span': + for attr, value in attrs: + if attr == 'class' and value == 'track_title': + self.__in_track = True + continue + if attr == 'tracktitle': + self.__current_track = value + + def handle_data(self, text): + if self.__in_track: + self.tracks[self.__current_track] = text + + def handle_endtag(self, tag): + if tag == 'div': + self.__in_row = False + if tag == 'a': + self.__in_track = False + self.__current_track = None + + +class search_youtube(HTMLParser): + """ This class should be initiated with a list of search terms. It exposes a list of YouTube video ids `self.track_ids`. """ + + def __init__(self, search_terms): + self.track_ids = [] + for search in search_terms: + HTMLParser.__init__(self) + page = '' + self.__in_search_results = False + search = urllib.quote_plus(search) + url = 'http://youtube.com/results?search_query=' + connection = urllib.urlopen(url + search) + encoding = connection.headers.getparam('charset') + page = connection.read() + page = page.decode(encoding) + try: + self.feed(page) + except UnicodeDecodeError: + print 'problem decoding', url + search + except UnicodeEncodeError: + print 'problem encoding', url + search + except HTMLParseError: + # There is no way to override HTMLParseError and + # continue parsing, see: + # http://bugs.python.org/issue755660 + # But the data is there! 
+ print 'problem parsing', url + search + except found_video: + pass + + def handle_starttag(self, tag, attrs): + if tag == 'div': + for attr, value in attrs: + if attr == 'id' and value == 'search-results': + self.__in_search_results = True + if self.__in_search_results: + for attr, value in attrs: + if attr == 'href' and value[:-11] == '/watch?v=' and len(value[9:]) == 11: + self.track_ids.append(value[9:]) + self.__in_search_results = False + #self.reset() + # Calling self.reset() causes the following error: + + # File "/usr/lib/python2.6/HTMLParser.py", line 108, in feed self.goahead(0) + # File "/usr/lib/python2.6/HTMLParser.py", line 148, in goahead k = self.parse_starttag(i) + # File "/usr/lib/python2.6/HTMLParser.py", line 229, in parse_starttag endpos = self.check_for_whole_start_tag(i) + # File "/usr/lib/python2.6/HTMLParser.py", line 305, in check_for_whole_start_tag + # raise AssertionError("we should not get here!") + + # I can't figure out why that's happening. I've + # discovered that calling HTMLParser.__init__(self) + # inside the search term loop in self.__init__ also + # resets the instance. The instance must be reset to + # accept a new page with self.feed(). Until a better + # solution is found: + raise found_video + +class found_video(BaseException): + """ Exception class to throw after finding a video to stop HTMLParser. 
""" + pass diff --git a/youtube.py b/youtube.py deleted file mode 100755 index 99381d4..0000000 --- a/youtube.py +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env python - -from HTMLParser import HTMLParser, HTMLParseError -import urllib -#from doit import pandora_fetch -# -#USER = 'alphabethos' -#user_data = pandora_fetch(USER) -# -# -#searches = [] -#for title, artist in user_data.tracks.iteritems(): - #search = title + " " + artist - #searches.append(search) - -searches = ['Sugarcube Yo La Tengo', "Runnin' [Philippians Rmx] The Pharcyde", 'Take Me Out Franz Ferdinand', 'Our Remains Bitter:Sweet', 'Clan Primate Heys', 'In Remembrance Exile', 'Point Of No Return Dead Moon', "What's Golden Jurassic 5", "If It Wasn't For You (feat. De La Soul And Starchild Excalibur) Handsome Boy Modeling School", '505 Arctic Monkeys', 'Sea Legs The Shins', 'Breathe Telepopmusik', 'Back 4 U (live) Jurassic 5', 'Cretin Hop The Ramones', 'The Modern Age The Strokes', 'Naive The Kooks', 'No Buses Arctic Monkeys', 'Concrete Schoolyard Jurassic 5', 'Cause = Time Broken Social Scene', 'Conquer ABK', 'Hip Hop Dead Prez', 'Electro Sixteen Benny Benassi', 'Nights Introlude (Radio Edit) Nightmares On Wax', "Don't Ask Me Ok Go", 'She Does Locksley', 'Storm Vibrations Guided By Voices', 'Change Mr. J. Medeiros', 'Soma The Strokes', 'Rampage EPMD', "Don't Make Me Wait Locksley", 'Love And Emotion [Instrumental] Benny Benassi', 'Teddy Picker Arctic Monkeys', 'Crazy Gnarls Barkley', "Spittin' Images Exile (Producer)", "Simply Amazin' - Steel Blazin' Exile", 'Cao J. Rawls', 'Show Me Love Laidback Luke', 'Back Home (The Return) Common Market', '100% Dundee The Roots', 'Cigarette Smoker Fiona (Live) Arctic Monkeys', 'Someday The Strokes', 'Here It Goes Again Ok Go', 'Fell In Love With A Girl The White Stripes', 'Mr. Brown Styles Of Beyond', 'Stacie Anne The Fratellis', 'The Uh-Huh The Pharcyde', u'Get Up (D.O.N.S. 
& DBN Remix) Niki Belucci', 'Anti-Matter King Geedorah', 'Wait For Me The Pigeon Detectives', 'Around The World/Harder Better Faster Stronger (Live) Daft Punk', 'Loud Pipes Ratatat', "Ain't No Rest For The Wicked Cage The Elephant", 'Local Boy The Rifles', 'The Way It Is The Strokes', 'Heart In A Cage The Strokes', 'Blu Collar Worker Exile', 'What You Want The Roots', 'Be Still (Extended Mix) Kaskade', 'Kill The Director The Wombats', "Why Don't You Gramophonedzie", 'Only One Chris Lake', u'Come Fly Away (Soha & Adam K Remix) Benny Benassi', 'Cry For You (Radio Mix) September', 'Not Alone (Deadmau5 Instrumental) Gianluca Motta', 'Cigarette Smoker Fiona Arctic Monkeys', 'D Is For Dangerous Arctic Monkeys', 'Small Town Girl Good Shoes', 'Rapp Snitch Knishes (Mr. Fantastik) MF Doom', "93 'til Infinity Souls Of Mischief", 'Call On Me (Eric Prydz Vs Retarded Funk Remix) Eric Prydz', 'Perhaps Vampires Is A Bit Strong But.. Arctic Monkeys', 'Us And Them Pink Floyd', 'Weight Of The World Pigeon John', "Don't Look Back Telepopmusik", 'Get Over It Ok Go', 'Get What I Want Bitter:Sweet', "We're A Happy Family The Ramones", 'Fight For You Morgan Page', 'Dancing In The Rain Exile', 'Jerk It Out Caesars', 'Automatic Stop The Strokes', 'Nas Is Like Nas', 'Sheena Is A Punk Rocker The Ramones', 'Too Long/Steam Machine (Live) Daft Punk', 'Dance To My Ministry Brand Nubian', 'Tell Me Why (Radio Edit) Supermode', 'Trying Your Luck The Strokes', 'Chelsea Dagger The Fratellis', "Sinnerman (Felix Da Housecat's Heavenly House Mix) Nina Simone", 'The Choice Is Yours (Revisited) Black Sheep', 'All That You Are The Foreign Exchange', 'Old Yellow Bricks Arctic Monkeys', 'Alive With The Glory Of Love Say Anything', 'Work the Angles Dilated Peoples', 'Two And Two Talib Kweli', 'Last Hour Elliott Smith', "All These Things That I've Done The Killers", 'Romantic Type The Pigeon Detectives', 'Turn Off The Radio Dead Prez', 'Kick In The Door The Notorious B.I.G.', 'Sonic Reducer The Dead Boys', 'One 
Beer MF Doom', 'Go That Deep (Skylark Vocal Remix) Nufrequency', 'Brooklyn Go Hard Jay-Z', 'I Remember Deadmau5', 'Feel Lonely Alex Monakhov', 'All That You Are (Remixes Blend) Nicolay', 'Diferente Gotan Project', "Runnin' [Philippians Rmx Instrumental] The Pharcyde", 'The Narrow Path (Instrumental) Blu', 'Riot Van Arctic Monkeys', 'Fine And Free Guru', 'Hip 2 The Skeme The Coup', 'Cana*T Control Myself The Pigeon Detectives', 'Hell On Earth (Front Lines) Mobb Deep', 'The World Is Yours Nas', 'I Bet You Look Good On The Dancefloor Arctic Monkeys', "I'll Be Your Man The Black Keys", 'Trouble Bitter:Sweet', 'Careful Television', 'Mastermind Deltron 3030', 'Glory Box Portishead', 'Pavadita Color Tango De Roberto Alvarez', 'Are You Gonna Be My Girl? Jet', 'World, Hold On Bob Sinclar', 'Incinerate Sonic Youth', 'Radio Freq Dead Prez', 'Anarchy In The UK (Live) Sex Pistols', 'All For You RJD2', 'Go It Alone Beck', 'Psychotic Girl The Black Keys', 'Dull Life Yeah Yeah Yeahs', 'Broke Up The Time The Futureheads', 'Leave Before The Lights Come On Arctic Monkeys', 'Children (Club Radio Edit) 4 Clubbers', 'Baditude (Original Club Mix) Obernik', 'Topographic Darkleaf', 'Miami 2 Ibiza (Instrumental) Swedish House Mafia', 'The Island Pt. II (Dusk) Pendulum', 'WAR Little Brother', 'Meet Me Halfway (At The Remix) (Will.I.Am Remix) Black Eyed Peas', 'Milonga Astor Piazzolla', 'Heads Will Roll Yeah Yeah Yeahs', "I Don't Know Badi", 'Forest Whitiker Brother Ali', 'Perfect Moments (Official Airport Jam (Radio Edit) Yep', "Groovin' Kero One", 'Da Hype Junior Jack', 'I Used To Love H.E.R. Common', 'All Over Again Locksley', 'Close Edge Mos Def', 'The Longest Road Morgan Page', 'Migraine Headache (feat. 
ICP) Esham', 'Bittersweet Faith Bitter:Sweet', 'Deep Fried Frenz MF Doom', 'This House Is A Circus Arctic Monkeys', 'Red Light The Strokes', 'Rain (Cosmic Gate Remix) Armin Van Buuren', 'Dona*T Know How To Say Goodbye The Pigeon Detectives', "Roadkill (Edx's Alcapulco At Night Remix) Dubfire", 'Halftime Nas', 'Apes From Space (Dirtyloud Remix) Aaren San', 'Bigger Boys And Stolen Sweethearts Arctic Monkeys', 'The Narrow Path Blu', "Chillin' Modjo", 'Below The Heavens Pt. II Exile', 'Police On My Back The Clash', 'The Scene Is Dead We Are Scientists', "Coastin' Zion I", 'Bulletproof (Live At Shepherds Bush Empire, London) La Roux', 'Lost In The Post The Wombats', 'Whatever Lola Wants (Gotan Project Remix) Sarah Vaughan', 'Calabria 2008 Enur', 'Kings County Ming + FS', 'Mr. Brightside The Killers', 'Last Nite The Strokes', 'No Chance Soulstice (Rap)', 'Lady Modjo', 'Be Healthy Dead Prez', 'Repeated Offender The Rifles', "You Can't Hide, You Can't Run Dilated Peoples", 'Wusgood Clutch Players', "Azzurra (It's Not The Same Version) Gui Boratto", 'Picture The Blakes', 'Skills Gang Starr', 'Ghostwriter RJD2', 'Go! 
Common', 'Heads Will Roll (A-Trak Dub Mix) Yeah Yeah Yeahs', 'Red House Jimi Hendrix', 'Poker Face (Jody Den Broeder Remix) Lady Gaga', 'Hate To Say I Told You So The Hives', 'The Who Hieroglyphics', 'Stormy Weather The Kooks', 'Desert Eagle Ratatat', "You Probably Couldn't See For The Lights But You Were Staring Straight At Me Arctic Monkeys", 'All Men Are Freezing Robert Pollard', 'For The Girl The Fratellis', 'See No Evil Television', 'Aerodynamic (Daft Punk Remix) Daft Punk', 'Sandstorm Darude', 'White Knight Two Surkin', 'Hang Me Up To Dry Cold War Kids', 'Burning The Whitest Boy Alive', 'Mistress Mabel The Fratellis', 'Rite Where U Stand Gang Starr', 'YGM Atmosphere', 'One Swedish House Mafia', 'Buttons (Markus Schulz Vocal Mix) Sia', "I Don't Care Black Flag", 'One More Time/Aerodynamic (Live) Daft Punk', 'You Wish Nightmares On Wax', 'Meet Me Halfway (DJ Ammo/Poet Named Life Remix) Black Eyed Peas', 'Heartbeats (Extended Mix) Grum'] - -print 'starting with ', len(searches), 'searches.' - -class search_youtube(HTMLParser): - - def __init__(self, search_terms): - self.track_ids = [] - for search in search_terms: - HTMLParser.__init__(self) - page = '' - self.__in_search_results = False - search = urllib.quote_plus(search) - url = 'http://youtube.com/results?search_query=' - connection = urllib.urlopen(url + search) - encoding = connection.headers.getparam('charset') - page = connection.read() - page = page.decode(encoding) - try: - self.feed(page) - except UnicodeDecodeError: - print 'problem decoding', url + search - except UnicodeEncodeError: - print 'problem encoding', url + search - except HTMLParseError: - # There is no way to override HTMLParseError and - # continue parsing, see: - # http://bugs.python.org/issue755660 - # But the data is there! 
- print 'problem parsing', url + search - except found_video: - pass - - def handle_starttag(self, tag, attrs): - if tag == 'div': - for attr, value in attrs: - if attr == 'id' and value == 'search-results': - self.__in_search_results = True - if self.__in_search_results: - for attr, value in attrs: - if attr == 'href' and value[:-11] == '/watch?v=' and len(value[9:]) == 11: - self.track_ids.append(value[9:]) - self.__in_search_results = False - #self.reset() - # Calling self.reset() causes the following error: - - # File "/usr/lib/python2.6/HTMLParser.py", line 108, in feed self.goahead(0) - # File "/usr/lib/python2.6/HTMLParser.py", line 148, in goahead k = self.parse_starttag(i) - # File "/usr/lib/python2.6/HTMLParser.py", line 229, in parse_starttag endpos = self.check_for_whole_start_tag(i) - # File "/usr/lib/python2.6/HTMLParser.py", line 305, in check_for_whole_start_tag - # raise AssertionError("we should not get here!") - - # I can't figure out why that's happening. I've - # discovered that calling HTMLParser.__init__(self) - # inside the search term loop in self.__init__ also - # resets the instance. The instance must be reset to - # accept a new page with self.feed(). Until a better - # solution is found: - raise found_video - -class found_video(BaseException): - """ Exception class to throw after finding a video to stop HTMLParser. """ - pass - - -#results = search_youtube(searches) -#print results.track_ids