import os
import sys
import datetime
import functools

import lxml.etree  # explicit submodule import: `import lxml` alone does not guarantee lxml.etree is loaded
import pytz
import requests
import requests.exceptions
from lxml.html.clean import Cleaner

from email.utils import parsedate_tz, mktime_tz

# Namespace prefix -> URI map for XPath lookups against RSS/Atom feeds.
# NOTE: the original literal repeated 'atom', 'dc', 'content' and 'slash'
# with identical URIs; duplicate keys silently overwrite each other in a
# dict literal, so they are listed exactly once here.
namespaces = {
    'atom': "http://www.w3.org/2005/Atom",
    'openSearch': "http://a9.com/-/spec/opensearchrss/1.0/",
    'blogger': "http://schemas.google.com/blogger/2008",
    'rdf': "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    'slash': "http://purl.org/rss/1.0/modules/slash/",
    'content': "http://purl.org/rss/1.0/modules/content/",
    'taxo': "http://purl.org/rss/1.0/modules/taxonomy/",
    'dc': "http://purl.org/dc/elements/1.1/",
    'syn': "http://purl.org/rss/1.0/modules/syndication/",
    'admin': "http://webns.net/mvcb/",
    'feedburner': "http://rssnamespace.org/feedburner/ext/1.0",
    'wfw': "http://wellformedweb.org/CommentAPI/",
    'sy': "http://purl.org/rss/1.0/modules/syndication/",
}


class feed_reader:
    """Fetch and parse a collection of RSS/Atom feeds (remote URLs or local files)."""

    # HTML cleaner used to strip unwanted/dangerous markup out of feed
    # description text before it is stored or rendered.
    html_cleaner = Cleaner()
    html_cleaner.javascript = True
    html_cleaner.style = True
    html_cleaner.remove_tags = ['script', 'iframe', 'link', 'style']

    # Entries older than this cut-off (about a year and a half ago) are
    # filtered out.  NOTE(review): naive local time is compared against
    # feed dates that are normalised to naive UTC — confirm the host
    # runs in UTC or accept the skew.
    filter_by_date = datetime.datetime.now() - datetime.timedelta(days=int(1.5 * 365))  # 1 and a half years ago

    #html_cleaner.allow_tags = ['script', 'iframe', 'link', 'style']
    #html_cleaner.kill_tags = ['script',
'iframe', 'link', 'style'] def __init__(self, feed_details, timeout=5): self.results = {} parser = lxml.etree.XMLParser(remove_blank_text=True, ns_clean=True, encoding='utf-8') for feed_info in feed_details: self.url = feed_info.get('url') self.author = feed_info.get('author') self.tags = feed_info.get('tags') if feed_info.get('url').startswith('http:'): response = requests.get(feed_info.get('url'), stream=True, timeout=timeout) if response.headers.get('content-encoding') == 'gzip': response.raw.read = functools.partial(response.raw.read, decode_content=True) self.feed = lxml.etree.parse(response.raw, parser) else: fp = open(feed_info.get('url'), 'r') self.feed = lxml.etree.parse(fp, parser) self.feed = self.feed.getroot() self.parse_feed() def convert_rfc822_to_datetime(self, rfcdate): if len(rfcdate): parsed_rfcdate = parsedate_tz( rfcdate ) if not parsed_rfcdate: return None return datetime.datetime.fromtimestamp( mktime_tz(parsed_rfcdate), pytz.utc ).replace(tzinfo=None) return None def clean_up_text(self, text): """strip out any dirty tags like