import os
import sys
import lxml
import pytz
import StringIO
import datetime
import requests
import functools
import requests.exceptions

from operator import itemgetter
from email.utils import parsedate_tz, mktime_tz

from lxml import etree
from lxml.html.clean import Cleaner

# XML namespaces used when querying feed documents
namespaces = {
    'openSearch': "http://a9.com/-/spec/opensearchrss/1.0/",
    'blogger': "http://schemas.google.com/blogger/2008",
    'rdf': "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    'slash': "http://purl.org/rss/1.0/modules/slash/",
    'content': "http://purl.org/rss/1.0/modules/content/",
    'taxo': "http://purl.org/rss/1.0/modules/taxonomy/",
    'dc': "http://purl.org/dc/elements/1.1/",
    'syn': "http://purl.org/rss/1.0/modules/syndication/",
    'sy': "http://purl.org/rss/1.0/modules/syndication/",
    'admin': "http://webns.net/mvcb/",
    'feedburner': "http://rssnamespace.org/feedburner/ext/1.0",
    'wfw': "http://wellformedweb.org/CommentAPI/",
    'atom': "http://www.w3.org/2005/Atom",
    'media': "http://search.yahoo.com/mrss/",
}


class feed_reader:
    """parse a list of feeds and return details as dictionary data"""

    # html cleaner used to strip unwanted tags out of the description text
    # (page_structure=True, remove_unknown_tags=True)
    html_cleaner = Cleaner()
    html_cleaner.javascript = True
    html_cleaner.style = True
    html_cleaner.remove_tags = ['script', 'iframe', 'link', 'style', 'img', 'div']
    #~ html_cleaner.allow_tags = ['a', 'p', 'strong']

    # entries older than this are dropped when the date filter is enabled
    filter_by_date_expire = datetime.datetime.now() - datetime.timedelta(days=int(1.5 * 365))  # a year and a half ago

    # separate cleaner that keeps only <img> tags, used to pull images out of entries
    html_img_cleaner = Cleaner(allow_tags=['img'], remove_unknown_tags=False)

    html_parser = lxml.etree.HTMLParser()
    xml_parser = lxml.etree.XMLParser(remove_blank_text=True, ns_clean=True, encoding='utf-8')

    enable_date_filter = True

    def __init__(self, feed_details, timeout=5):
        self.results = {}
        for feed_info in feed_details:
            self.url = feed_info.get('url')
            self.author = feed_info.get('author')
            self.tags = feed_info.get('tags')

            if self.url.startswith(('http://', 'https://')):
                # remote feed: fetch over http(s) and stream the body into lxml
                try:
                    response = requests.get(self.url, stream=True, timeout=timeout)
                except requests.exceptions.Timeout:
                    continue

                # make urllib3 transparently decompress gzip-encoded responses
                if response.headers.get('content-encoding') == 'gzip':
                    response.raw.read = functools.partial(response.raw.read, decode_content=True)

                try:
                    self.feed = lxml.etree.parse(response.raw, self.xml_parser)
                except Exception:
                    continue
            else:
                # local feed: treat the url as a path on disk
                with open(os.path.abspath(self.url), 'r') as file_stream:
                    try:
                        self.feed = lxml.etree.parse(file_stream, self.xml_parser)
                    except Exception:
                        continue

            self.feed = self.feed.getroot()

            # rss feed defaults
            self.channel_image = self.fetch_node_text(self.feed, 'channel/image/url', '')

            self.parse_feed()

    def convert_rfc822_to_datetime(self, rfcdate):
        """rss uses rfc822 dates so convert them to datetime for use later"""
        if len(rfcdate):
            parsed_rfcdate = parsedate_tz(rfcdate)
            if not parsed_rfcdate:
                return None
            return datetime.datetime.fromtimestamp(
                mktime_tz(parsed_rfcdate), pytz.utc).replace(tzinfo=None)
        return None

    def clean_up_text(self, text):
        """strip out any dirty tags like