from __future__ import with_statement __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal ' from calibre.web.feeds.news import BasicNewsRecipe import string class TheHindu(BasicNewsRecipe): title = u'The Hindu - Customized' language = 'en_IN' oldest_article = 1 __author__ = 'Ankurs Calibre' max_articles_per_feed = 100 no_stylesheets = True auto_cleanup = True extra_css = '.photo-caption { font-size: smaller }' def parse_index(self): soup = self.index_to_soup('http://www.thehindu.com/todays-paper/') nav_div = soup.find(id='tpnav-bar') section_list = [] # Finding all the section titles that are acceptable for x in nav_div.findAll(['a']): if self.is_accepted_entry(x): section_list.append( (string.capwords(self.tag_to_string(x)), x['href'])) # For each section title, fetch the article urls feeds = [] for section in section_list: section_title = section[0] section_url = section[1] soup = self.index_to_soup(section_url) current_articles = [] div = soup.find('div', attrs={'id': 'left-column'}) soup.find('span', attrs={'class': 'newsection-title'}).extract() soup.find('div', attrs={'id': 'tpnav-bar'}).extract() for x in div.findAll(['a']): title = self.tag_to_string(x) url = x.get('href', False) if not url or not title: continue self.log('\t\tFound article:', title) self.log('\t\t\t', url) current_articles.append({'title': title, 'url': url, 'description': '', 'date': ''}) feeds.append((section_title, current_articles)) return feeds def is_accepted_entry(self, entry): # Those sections in the top nav bar that we will omit
omit_list = ['tp-tamilnadu', 'tp-karnataka', 'tp-telangana', 'tp-kerala', 'tp-andhrapradesh', 'tp-in-school', 'tp-cinemaplus', 'tp-propertyplus', 'tp-downtown', 'tp-agriculture', 'tp-openpage'] is_accepted = True for omit_entry in omit_list: if entry['href'][0:-1].endswith(omit_entry): is_accepted = False break return is_accepted