import os import sys import lxml import pytz import StringIO import datetime import requests import functools import requests.exceptions from lxml.html.clean import Cleaner namespaces = { 'openSearch': "http://a9.com/-/spec/opensearchrss/1.0/", 'blogger': "http://schemas.google.com/blogger/2008", 'rdf': "http://www.w3.org/1999/02/22-rdf-syntax-ns#", 'slash': "http://purl.org/rss/1.0/modules/slash/", 'content': "http://purl.org/rss/1.0/modules/content/", 'taxo': "http://purl.org/rss/1.0/modules/taxonomy/", 'dc': "http://purl.org/dc/elements/1.1/", 'syn': "http://purl.org/rss/1.0/modules/syndication/", 'admin': "http://webns.net/mvcb/", 'feedburner': "http://rssnamespace.org/feedburner/ext/1.0", 'wfw': "http://wellformedweb.org/CommentAPI/", 'dc': "http://purl.org/dc/elements/1.1/", 'atom': "http://www.w3.org/2005/Atom", 'sy': "http://purl.org/rss/1.0/modules/syndication/", 'slash': "http://purl.org/rss/1.0/modules/slash/", 'atom': "http://www.w3.org/2005/Atom", 'content': "http://purl.org/rss/1.0/modules/content/", 'media': "http://search.yahoo.com/mrss/", } from email.utils import parsedate_tz, mktime_tz class feed_reader: """parse a list of feeds and return details as dictionary data""" #create the html cleaner, this is to clean out unwanted html tags in the description text html_cleaner = Cleaner() html_cleaner.javascript = True html_cleaner.style = True html_cleaner.remove_tags = ['script', 'iframe', 'link', 'style', 'img'] filter_by_date = datetime.datetime.now() - datetime.timedelta(days=int(1.5*365)) # 1 and a half years ago html_img_cleaner = Cleaner(allow_tags=['img'], remove_unknown_tags=False) html_img_cleaner.allow_tags = ['img'] html_parser = lxml.etree.HTMLParser() xml_parser = lxml.etree.XMLParser(remove_blank_text=True, ns_clean=True, encoding='utf-8') def __init__(self, feed_details, timeout=5): self.results = {} for feed_info in feed_details: self.url = feed_info.get('url') self.author = feed_info.get('author') self.tags = feed_info.get('tags') if feed_info.get('url').startswith('http:'): response = requests.get(feed_info.get('url'), stream=True, timeout=timeout) if response.headers.get('content-encoding') == 'gzip': response.raw.read = functools.partial(response.raw.read, decode_content=True) try: self.feed = lxml.etree.parse(response.raw, self.xml_parser) except: continue else: with open(os.path.abspath(feed_info.get('url')), 'r') as file_stream: try: self.feed = lxml.etree.parse(file_stream, self.xml_parser) except: continue self.feed = self.feed.getroot() # rss feed defaults self.channel_image = self.fetch_node_text(self.feed, 'channel/image/url', '') self.parse_feed() def convert_rfc822_to_datetime(self, rfcdate): """rss uses rfc822 dates so lets convert them to datetime for use later""" if len(rfcdate): parsed_rfcdate = parsedate_tz(rfcdate) if not parsed_rfcdate: return None return datetime.datetime.fromtimestamp( mktime_tz(parsed_rfcdate), pytz.utc).replace(tzinfo=None) return None def clean_up_text(self, text): """strip out any dirty tags like