improve feed reader image parsing

This commit is contained in:
Oliver Marks 2015-06-15 18:08:38 +01:00
parent 4647a4d315
commit 27f3a968ca
35 changed files with 106 additions and 130 deletions

0
LICENSE Normal file → Executable file
View File

0
site/__init__.py Normal file → Executable file
View File

View File

@ -5,9 +5,9 @@ page_menu = [
('Contact', '#mailing-list-signup')] ('Contact', '#mailing-list-signup')]
banner_images = [ banner_images = [
('/static/template/images/hackspace-banner.png', 'http://maidstone-hackspace.org.uk/', 'title', 'intro text'), ('/static/template/images/hackspace-banner.png', '', 'title', 'intro text'),
('/static/template/images/example-01.jpg', 'http://www.google.co.uk', 'title', 'intro text'), ('/static/template/images/example-01.jpg', '', 'title', 'intro text'),
('/static/template/images/example-02.jpg', 'http://www.google.co.uk', 'title', 'intro text')] ('/static/template/images/example-02.jpg', '', 'title', 'intro text')]
tile_images = [ tile_images = [
('/static/template/images/tile-01.jpg',), ('/static/template/images/tile-01.jpg',),
@ -19,6 +19,7 @@ rss_feeds = [
{'author':'Simon Ridley', {'author':'Simon Ridley',
'url': 'http://waistcoatforensicator.blogspot.com/feeds/posts/default?alt=rss'}, 'url': 'http://waistcoatforensicator.blogspot.com/feeds/posts/default?alt=rss'},
{'author':'Mathew Beddow', 'tags': ['tech'], 'url': 'http://www.matthewbeddow.co.uk/?feed=rss2'}, {'author':'Mathew Beddow', 'tags': ['tech'], 'url': 'http://www.matthewbeddow.co.uk/?feed=rss2'},
#{'author':'Oliver Marks', 'url': 'http://www.digitaloctave.co.uk/rss.xml'},
{'author':'Mike McRoberts', 'url': 'http://thearduinoguy.org/?feed=rss2'}] {'author':'Mike McRoberts', 'url': 'http://thearduinoguy.org/?feed=rss2'}]
kent_hackspace = ['http://www.medwaymakers.co.uk/'] kent_hackspace = ['http://www.medwaymakers.co.uk/']

View File

@ -108,6 +108,7 @@ def blogs():
feed = feed_reader(site.rss_feeds) feed = feed_reader(site.rss_feeds)
for row in feed: for row in feed:
print row.get('image')
web.tiles.append( web.tiles.append(
title = row.get('title'), title = row.get('title'),
author = row.get('author'), author = row.get('author'),
@ -148,7 +149,7 @@ def index():
web.paragraph.append( web.paragraph.append(
"""We are in the process of developing Maidstone Hackspace. We're previous members of <span class="info" title="Innovation center medway prototype">(ICMP)</span> and looking to form a new space in the future. """We are in the process of developing Maidstone Hackspace. We're previous members of <span class="info" title="Innovation center medway prototype">(ICMP)</span> and looking to form a new space in the future.
At the moment, communication is via google groups, email, and the website. If you're at all intrested please join our <a href="#mailing-list">mailing list</a> At the moment, communication is via google groups, email, and the website. If you're at all intrested please join our <a href="#mailing-list-signup">mailing list</a>
and make yourself known!""") and make yourself known!""")
web.page.section(web.paragraph.render()) web.page.section(web.paragraph.render())
@ -160,7 +161,7 @@ def index():
bullet_list.append( bullet_list.append(
("""Build an interactive splash screen to feature on this site.""",)) ("""Build an interactive splash screen to feature on this site.""",))
bullet_list.append( bullet_list.append(
(web.link.create('Suggest a new activity', 'Suggest a new activity', '#mailing-list').render(),)) (web.link.create('Suggest a new activity', 'Suggest a new activity', '#mailing-list-signup').render(),))
web.list.create(ordered=False).set_classes('bullet-list') web.list.create(ordered=False).set_classes('bullet-list')
web.list * bullet_list web.list * bullet_list
@ -180,7 +181,7 @@ if __name__ == "__main__":
#~ args = parser.parse_args() #~ args = parser.parse_args()
#~ print(args.accumulate(args.integers)) #~ print(args.accumulate(args.integers))
with codecs.open('./index.html', 'w', "utf-8") as fp: with codecs.open('./html/index.html', 'w', "utf-8") as fp:
fp.write(index().decode('utf-8')) fp.write(index().decode('utf-8'))
#~ with open('./html/examples.html', 'w') as fp: #~ with open('./html/examples.html', 'w') as fp:
#~ fp.write(examples()) #~ fp.write(examples())

File diff suppressed because one or more lines are too long

View File

@ -2,17 +2,15 @@ import os
import sys import sys
import lxml import lxml
import pytz import pytz
import StringIO
import datetime import datetime
import requests import requests
import functools import functools
import requests.exceptions import requests.exceptions
#from lxml import etree, objectify
from lxml.html.clean import Cleaner from lxml.html.clean import Cleaner
namespaces = { namespaces = {
'atom': "http://www.w3.org/2005/Atom",
'openSearch': "http://a9.com/-/spec/opensearchrss/1.0/", 'openSearch': "http://a9.com/-/spec/opensearchrss/1.0/",
'blogger': "http://schemas.google.com/blogger/2008", 'blogger': "http://schemas.google.com/blogger/2008",
'rdf': "http://www.w3.org/1999/02/22-rdf-syntax-ns#", 'rdf': "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
@ -23,48 +21,37 @@ namespaces = {
'syn': "http://purl.org/rss/1.0/modules/syndication/", 'syn': "http://purl.org/rss/1.0/modules/syndication/",
'admin': "http://webns.net/mvcb/", 'admin': "http://webns.net/mvcb/",
'feedburner': "http://rssnamespace.org/feedburner/ext/1.0", 'feedburner': "http://rssnamespace.org/feedburner/ext/1.0",
'content': "http://purl.org/rss/1.0/modules/content/",
'wfw': "http://wellformedweb.org/CommentAPI/", 'wfw': "http://wellformedweb.org/CommentAPI/",
'dc': "http://purl.org/dc/elements/1.1/", 'dc': "http://purl.org/dc/elements/1.1/",
'atom': "http://www.w3.org/2005/Atom", 'atom': "http://www.w3.org/2005/Atom",
'sy': "http://purl.org/rss/1.0/modules/syndication/", 'sy': "http://purl.org/rss/1.0/modules/syndication/",
'slash': "http://purl.org/rss/1.0/modules/slash/" 'slash': "http://purl.org/rss/1.0/modules/slash/",
'atom': "http://www.w3.org/2005/Atom",
'content': "http://purl.org/rss/1.0/modules/content/",
'media': "http://search.yahoo.com/mrss/",
} }
#~ import zlib
#~
#~ READ_BLOCK_SIZE = 1024 * 8
#~ def decompress_stream(fileobj):
#~ result = StringIO()
#~
#~ d = zlib.decompressobj(16 + zlib.MAX_WBITS)
#~ for chunk in iter(partial(response.raw.read, READ_BLOCK_SIZE), ''):
#~ result.write(d.decompress(chunk))
#~
#~ result.seek(0)
#~ return result
#~ parser = etree.XMLParser(remove_blank_text=True, ns_clean=True)
#~ tree = etree.parse(metadata, parser)
#~ root = tree.getroot()
from email.utils import parsedate_tz, mktime_tz from email.utils import parsedate_tz, mktime_tz
class feed_reader: class feed_reader:
"""parse a list of feeds and return details as dictionary data"""
#create the html cleaner, this is to clean out unwanted html tags in the description text #create the html cleaner, this is to clean out unwanted html tags in the description text
html_cleaner = Cleaner() html_cleaner = Cleaner()
html_cleaner.javascript = True html_cleaner.javascript = True
html_cleaner.style = True html_cleaner.style = True
html_cleaner.remove_tags = ['script', 'iframe', 'link', 'style'] html_cleaner.remove_tags = ['script', 'iframe', 'link', 'style', 'img']
filter_by_date = datetime.datetime.now() - datetime.timedelta(days=int(1.5*365)) # 1 and a half years ago filter_by_date = datetime.datetime.now() - datetime.timedelta(days=int(1.5*365)) # 1 and a half years ago
#html_cleaner.allow_tags = ['script', 'iframe', 'link', 'style']
#html_cleaner.kill_tags = ['script', 'iframe', 'link', 'style'] html_img_cleaner = Cleaner(allow_tags=['img'], remove_unknown_tags=False)
html_img_cleaner.allow_tags = ['img']
html_parser = lxml.etree.HTMLParser()
xml_parser = lxml.etree.XMLParser(remove_blank_text=True, ns_clean=True, encoding='utf-8')
def __init__(self, feed_details, timeout=5): def __init__(self, feed_details, timeout=5):
self.results = {} self.results = {}
parser = lxml.etree.XMLParser(remove_blank_text=True, ns_clean=True, encoding='utf-8')
for feed_info in feed_details: for feed_info in feed_details:
self.url = feed_info.get('url') self.url = feed_info.get('url')
self.author = feed_info.get('author') self.author = feed_info.get('author')
@ -73,27 +60,67 @@ class feed_reader:
response = requests.get(feed_info.get('url'), stream=True, timeout=timeout) response = requests.get(feed_info.get('url'), stream=True, timeout=timeout)
if response.headers.get('content-encoding') == 'gzip': if response.headers.get('content-encoding') == 'gzip':
response.raw.read = functools.partial(response.raw.read, decode_content=True) response.raw.read = functools.partial(response.raw.read, decode_content=True)
self.feed = lxml.etree.parse(response.raw, parser) try:
self.feed = lxml.etree.parse(response.raw, self.xml_parser)
except:
continue
else: else:
fp = open(feed_info.get('url'), 'r') with open(os.path.abspath(feed_info.get('url')), 'r') as file_stream:
self.feed = lxml.etree.parse(fp, parser) try:
self.feed = lxml.etree.parse(file_stream, self.xml_parser)
except:
continue
self.feed = self.feed.getroot() self.feed = self.feed.getroot()
# rss feed defaults
self.channel_image = self.fetch_node_text(self.feed, 'channel/image/url', '')
self.parse_feed() self.parse_feed()
def convert_rfc822_to_datetime(self, rfcdate):
    """rss uses rfc822 dates, turn one into a naive datetime for use later

    Returns None when the string is empty or cannot be parsed."""
    if len(rfcdate) == 0:
        return None
    date_parts = parsedate_tz(rfcdate)
    if not date_parts:
        return None
    # build the datetime in UTC from the epoch seconds, then drop the tzinfo
    utc_moment = datetime.datetime.fromtimestamp(mktime_tz(date_parts), pytz.utc)
    return utc_moment.replace(tzinfo=None)
def clean_up_text(self, text):
    """pass the text through the shared html cleaner so dirty tags like
    <script> can not break the site, and return the cleaned result"""
    cleaner = self.html_cleaner
    return cleaner.clean_html(text)
def fetch_image_from_node_text(self, text):
    """Return the src of the first <img> found in a piece of html text

    The text is parsed with the class html parser. None is returned when the
    text is None / empty, can not be parsed, or holds no image tag."""
    # empty nodes hand us None or '' and there is no document to parse in
    # that case, so bail out before touching the parser
    if not text or not text.strip():
        return None
    try:
        description = lxml.etree.parse(StringIO.StringIO(text), self.html_parser)
    except lxml.etree.XMLSyntaxError:
        # unparsable markup just means no image from this source
        return None
    images = description.xpath('.//img')
    if not images:
        return None
    return images[0].get('src')
def fetch_image(self, node):
    """Try and get an image from an item in the feed, use various fall back methods

    Order tried: the media:thumbnail url, the first <img> inside
    content:encoded, the first <img> inside description and finally the
    channel image stored on the reader."""
    thumbnails = node.xpath('media:thumbnail', namespaces=namespaces)
    if thumbnails:
        return thumbnails[0].get('url', '')

    # no media:thumbnail so lets try and grab an image from content:encoded,
    # the element can be empty in which case .text is None and there is
    # nothing to hand to the html parser
    encoded = node.xpath('content:encoded', namespaces=namespaces)
    if encoded and encoded[0].text:
        image = self.fetch_image_from_node_text(encoded[0].text)
        if image:
            return image

    # final attempt at getting an image from the item using description
    description = self.fetch_node_text(node, 'description')
    if description:
        image = self.fetch_image_from_node_text(description)
        if image:
            return image

    # no image so lets fall back to the channel image if it exists
    return self.channel_image
def fetch_node_text(self, node, name, default=''): def fetch_node_text(self, node, name, default=''):
"""fetch the text from the node we are given, we are working in unicode """fetch the text from the node we are given, we are working in unicode
so decode byte strings to unicode""" so decode byte strings to unicode"""
@ -106,10 +133,10 @@ class feed_reader:
else: else:
return default return default
def fetch_node_attribute(self, node, name, attribs, default):
    """fetch an attribute from the first child of node matching name

    node    -- element to search from
    name    -- relative path of the child element, appended to './'
    attribs -- name of the attribute to read
    default -- returned when no child element matches

    Returns '' when the element exists but does not carry the attribute."""
    matches = node.xpath('./%s' % name)
    if not matches:
        return default
    # xpath hands back a list of elements, the attribute lives on the
    # element itself so read it from the first match
    return matches[0].get(attribs, '')
@ -117,7 +144,7 @@ class feed_reader:
"""extract the authors name from the author text node""" """extract the authors name from the author text node"""
return author.split('(')[-1].strip(')') return author.split('(')[-1].strip(')')
def filter(self, node, tags=None): def filter_by_tags(self, node, tags=None):
"""filter the feed out by category tag, if no tags assume its pre filtered""" """filter the feed out by category tag, if no tags assume its pre filtered"""
if self.tags is None: if self.tags is None:
return True return True
@ -130,14 +157,13 @@ class feed_reader:
"""Parse the items in the feed, filter out bad data and put in defaults""" """Parse the items in the feed, filter out bad data and put in defaults"""
for item in self.feed.xpath('.//item', namespaces=namespaces): for item in self.feed.xpath('.//item', namespaces=namespaces):
date = self.convert_rfc822_to_datetime(self.fetch_node_text(item, 'pubDate')) date = self.convert_rfc822_to_datetime(self.fetch_node_text(item, 'pubDate'))
if date > self.filter_by_date and self.filter(item): if date > self.filter_by_date and self.filter_by_tags(item):
self.filter(item)
self.results[date] = { self.results[date] = {
'title': self.fetch_node_text(item, 'title'), 'title': self.fetch_node_text(item, 'title'),
'date': date, 'date': date,
'url': self.fetch_node_text(item, 'link'), 'url': self.fetch_node_text(item, 'link'),
'author': self.format_author(self.fetch_node_text(item, 'author', self.author)), 'author': self.format_author(self.fetch_node_text(item, 'author', self.author)),
'image': self.fetch_node_text(item, 'image'), 'image': self.fetch_image(item),
'description': self.clean_up_text(self.fetch_node_text(item, 'description'))} 'description': self.clean_up_text(self.fetch_node_text(item, 'description'))}
def __iter__(self): def __iter__(self):
@ -146,12 +172,10 @@ class feed_reader:
#print str(self.results[order]['date']) + ' - ' + self.results[order]['author'] + ' - ' + self.results[order]['title'] #print str(self.results[order]['date']) + ' - ' + self.results[order]['author'] + ' - ' + self.results[order]['title']
yield self.results[order] yield self.results[order]
rss_feeds = [ if __name__ == "__main__":
{'author':'Simon Ridley', 'url': 'http://waistcoatforensicator.blogspot.com/feeds/posts/default?alt=rss'}, rss_tests = [
{'author':'Mathew Beddow', 'tags': ['tech'], 'url': 'http://www.matthewbeddow.co.uk/?feed=rss2'}, {'author': 'Mike McRoberts', 'url': './rss_invalid.xml'},
{'author':'Mike McRoberts', 'url': 'http://thearduinoguy.org/?feed=rss2'}] {'author': 'Mike McRoberts', 'url': './rss_no_tags.xml'}]
test = feed_reader(rss_tests)
#~ import .constants
test = feed_reader(rss_feeds)
for item in test:
pass

0
site/static/template/images/example-01.jpg Normal file → Executable file
View File

Before

Width:  |  Height:  |  Size: 94 KiB

After

Width:  |  Height:  |  Size: 94 KiB

0
site/static/template/images/example-02.jpg Normal file → Executable file
View File

Before

Width:  |  Height:  |  Size: 93 KiB

After

Width:  |  Height:  |  Size: 93 KiB

0
site/static/template/images/example-03.jpg Normal file → Executable file
View File

Before

Width:  |  Height:  |  Size: 53 KiB

After

Width:  |  Height:  |  Size: 53 KiB

0
site/static/template/images/hackspace.png Normal file → Executable file
View File

Before

Width:  |  Height:  |  Size: 1.9 KiB

After

Width:  |  Height:  |  Size: 1.9 KiB

0
site/static/template/images/hackspace.svg Normal file → Executable file
View File

Before

Width:  |  Height:  |  Size: 100 KiB

After

Width:  |  Height:  |  Size: 100 KiB

0
site/static/template/images/icon.png Normal file → Executable file
View File

Before

Width:  |  Height:  |  Size: 1.3 KiB

After

Width:  |  Height:  |  Size: 1.3 KiB

0
site/static/template/images/tile-01.jpg Normal file → Executable file
View File

Before

Width:  |  Height:  |  Size: 14 KiB

After

Width:  |  Height:  |  Size: 14 KiB

0
site/static/template/images/tile-02.jpg Normal file → Executable file
View File

Before

Width:  |  Height:  |  Size: 8.7 KiB

After

Width:  |  Height:  |  Size: 8.7 KiB

0
site/widgets/__init__.py Normal file → Executable file
View File

0
site/widgets/__init__.pyc Normal file → Executable file
View File

View File

@ -3,43 +3,36 @@ from scaffold.web import www
class control(www.default.html_ui): class control(www.default.html_ui):
"""Image cycle widgets""" """Image cycle widgets"""
view=[] view = []
buttons=[] buttons = []
content=[] content = []
count=0 count = 0
offset=60
height=300
width=400
with open(os.path.abspath('./widgets/banner_slider.js')) as fp: with open(os.path.abspath('./widgets/banner_slider.js')) as fp:
script = [fp.read()] script = [fp.read()]
#~ def javascript(self):
#~ return fp.read()
#~ self.script.append()
def create(self):
    """Begin a new widget by delegating to reset()"""
    self.reset()
def reset(self):
    """Give this instance fresh, empty view, buttons and content lists"""
    for attribute in ('view', 'buttons', 'content'):
        setattr(self, attribute, [])
def append(self, image, link, title, intro=''):
    """Add a slide to the content list, the image is only wrapped in an
    anchor when a link is supplied"""
    if not link:
        slide = u'<img src="%s" /><div class="content">%s<br />%s</div>' % (image, title, intro)
    else:
        slide = u'<a href="%s" ><img src="%s" /><div class="content">%s<br />%s</div></a>' % (link, image, title, intro)
    self.content.append(slide)
def render(self): def render(self):
#~ self.script.append(self.javascript()) self.count += 0
self.count+=1
htm = u'<div class="banner-slide" ng-app="myApp" ng-controller="sliderController">' htm = u'<div class="banner-slide" ng-app="myApp" ng-controller="sliderController">'
htm += u'<ul style="%s" ng-switch on="currentSlide" ng-init="length=%d;">' % (self.height, len(self.content)) htm += u'<ul ng-switch on="currentSlide" ng-init="length=%d;">' % (len(self.content))
count = 0 count = 0
for item in self.content: for item in self.content:
htm += u'<li class="slide" ng-switch-when="%s">%s</li>' % (count, item) htm += u'<li class="slide" ng-switch-when="%s">%s</li>' % (count, item)
count += 1 count += 1
#htm += '''<li class="slide" ng-repeat="slide in slides" ng-hide="!isCurrentSlideIndex($index)" ng-show="isCurrentSlideIndex($index)"><a href="{{slide.link}}" ><img src="{{slide.src}}" /><div class="content">{{slide.title}}<br />{{slide.description}}</div></a></li>'''
htm += u'<li style="clear:both;"></li></ul>' htm += u'<li style="clear:both;"></li></ul>'
htm += u'<div ng-click="prev()" title="Previous" role="button" class="slide-button left">&lt;</div>' htm += u'<div ng-click="prev()" title="Previous" role="button" class="slide-button left">&lt;</div>'
htm += u'<div ng-click="next()" title="Next" role="button" class="slide-button right">&gt;</div>' htm += u'<div ng-click="next()" title="Next" role="button" class="slide-button right">&gt;</div>'

Binary file not shown.

0
site/widgets/footer-content.pyc Normal file → Executable file
View File

0
site/widgets/footer.pyc Normal file → Executable file
View File

0
site/widgets/footer_content.py Normal file → Executable file
View File

0
site/widgets/footer_content.pyc Normal file → Executable file
View File

0
site/widgets/google_group.pyc Normal file → Executable file
View File

0
site/widgets/google_groups.py Normal file → Executable file
View File

0
site/widgets/google_groups.pyc Normal file → Executable file
View File

0
site/widgets/google_hangout.py Normal file → Executable file
View File

0
site/widgets/google_hangout.pyc Normal file → Executable file
View File

0
site/widgets/header_strip.py Normal file → Executable file
View File

0
site/widgets/header_strip.pyc Normal file → Executable file
View File

0
site/widgets/headerstrip.pyc Normal file → Executable file
View File

0
site/widgets/loginbox.py Normal file → Executable file
View File

0
site/widgets/loginbox.pyc Normal file → Executable file
View File

0
site/widgets/readme Normal file → Executable file
View File

View File

@ -20,11 +20,12 @@ class control(www.default.html_ui):
for project in self.data: for project in self.data:
htm += u'<div class="tile">' htm += u'<div class="tile">'
if project.get('image'): if project.get('image'):
htm += u'<div class="tile-img"><img src="%s"/></div>' % project.get('image') #~ htm += u'<div class="tile-img" style="%s"><img src="%s"/></div>' % (background, project.get('image'))
htm += u'<div class="tile-img" style="background:center no-repeat url(%s);background-size:contain;"></div>' % project.get('image')
else: else:
htm += u'<div class="tile-img"></div>' htm += u'<div class="tile-img"></div>'
htm += u'<header class="tile-content"><h2><a href="%s">%s</a> By %s</h2></header>' % ( htm += u'<header class="tile-content"><h2><a href="%s">%s</a> By %s</h2></header>' % (
project.get('link'), project.get('title'),project.get('author')) project.get('link'), project.get('title'), project.get('author'))
htm += u'<div class="tile-content"><p>%s</p></div>' % (project.get('description')) htm += u'<div class="tile-content"><p>%s</p></div>' % (project.get('description'))
htm += u'</div>' htm += u'</div>'
return htm return htm

Binary file not shown.