improve feed reader image parsing
|
@ -5,9 +5,9 @@ page_menu = [
|
|||
('Contact', '#mailing-list-signup')]
|
||||
|
||||
banner_images = [
|
||||
('/static/template/images/hackspace-banner.png', 'http://maidstone-hackspace.org.uk/', 'title', 'intro text'),
|
||||
('/static/template/images/example-01.jpg', 'http://www.google.co.uk', 'title', 'intro text'),
|
||||
('/static/template/images/example-02.jpg', 'http://www.google.co.uk', 'title', 'intro text')]
|
||||
('/static/template/images/hackspace-banner.png', '', 'title', 'intro text'),
|
||||
('/static/template/images/example-01.jpg', '', 'title', 'intro text'),
|
||||
('/static/template/images/example-02.jpg', '', 'title', 'intro text')]
|
||||
|
||||
tile_images = [
|
||||
('/static/template/images/tile-01.jpg',),
|
||||
|
@ -19,6 +19,7 @@ rss_feeds = [
|
|||
{'author':'Simon Ridley',
|
||||
'url': 'http://waistcoatforensicator.blogspot.com/feeds/posts/default?alt=rss'},
|
||||
{'author':'Mathew Beddow', 'tags': ['tech'], 'url': 'http://www.matthewbeddow.co.uk/?feed=rss2'},
|
||||
#{'author':'Oliver Marks', 'url': 'http://www.digitaloctave.co.uk/rss.xml'},
|
||||
{'author':'Mike McRoberts', 'url': 'http://thearduinoguy.org/?feed=rss2'}]
|
||||
|
||||
kent_hackspace = ['http://www.medwaymakers.co.uk/']
|
||||
|
|
|
@ -108,6 +108,7 @@ def blogs():
|
|||
feed = feed_reader(site.rss_feeds)
|
||||
|
||||
for row in feed:
|
||||
print row.get('image')
|
||||
web.tiles.append(
|
||||
title = row.get('title'),
|
||||
author = row.get('author'),
|
||||
|
@ -148,7 +149,7 @@ def index():
|
|||
|
||||
web.paragraph.append(
|
||||
"""We are in the process of developing Maidstone Hackspace. We're previous members of <span class="info" title="Innovation center medway prototype">(ICMP)</span> and looking to form a new space in the future.
|
||||
At the moment, communication is via google groups, email, and the website. If you're at all intrested please join our <a href="#mailing-list">mailing list</a>
|
||||
At the moment, communication is via google groups, email, and the website. If you're at all intrested please join our <a href="#mailing-list-signup">mailing list</a>
|
||||
and make yourself known!""")
|
||||
web.page.section(web.paragraph.render())
|
||||
|
||||
|
@ -160,7 +161,7 @@ def index():
|
|||
bullet_list.append(
|
||||
("""Build an interactive splash screen to feature on this site.""",))
|
||||
bullet_list.append(
|
||||
(web.link.create('Suggest a new activity', 'Suggest a new activity', '#mailing-list').render(),))
|
||||
(web.link.create('Suggest a new activity', 'Suggest a new activity', '#mailing-list-signup').render(),))
|
||||
|
||||
web.list.create(ordered=False).set_classes('bullet-list')
|
||||
web.list * bullet_list
|
||||
|
@ -180,7 +181,7 @@ if __name__ == "__main__":
|
|||
#~ args = parser.parse_args()
|
||||
#~ print(args.accumulate(args.integers))
|
||||
|
||||
with codecs.open('./index.html', 'w', "utf-8") as fp:
|
||||
with codecs.open('./html/index.html', 'w', "utf-8") as fp:
|
||||
fp.write(index().decode('utf-8'))
|
||||
#~ with open('./html/examples.html', 'w') as fp:
|
||||
#~ fp.write(examples())
|
||||
|
|
|
@ -2,17 +2,15 @@ import os
|
|||
import sys
|
||||
import lxml
|
||||
import pytz
|
||||
import StringIO
|
||||
import datetime
|
||||
import requests
|
||||
import functools
|
||||
import requests.exceptions
|
||||
|
||||
|
||||
#from lxml import etree, objectify
|
||||
from lxml.html.clean import Cleaner
|
||||
|
||||
namespaces = {
|
||||
'atom': "http://www.w3.org/2005/Atom",
|
||||
'openSearch': "http://a9.com/-/spec/opensearchrss/1.0/",
|
||||
'blogger': "http://schemas.google.com/blogger/2008",
|
||||
'rdf': "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
|
||||
|
@ -23,48 +21,37 @@ namespaces = {
|
|||
'syn': "http://purl.org/rss/1.0/modules/syndication/",
|
||||
'admin': "http://webns.net/mvcb/",
|
||||
'feedburner': "http://rssnamespace.org/feedburner/ext/1.0",
|
||||
'content': "http://purl.org/rss/1.0/modules/content/",
|
||||
'wfw': "http://wellformedweb.org/CommentAPI/",
|
||||
'dc': "http://purl.org/dc/elements/1.1/",
|
||||
'atom': "http://www.w3.org/2005/Atom",
|
||||
'sy': "http://purl.org/rss/1.0/modules/syndication/",
|
||||
'slash': "http://purl.org/rss/1.0/modules/slash/"
|
||||
'slash': "http://purl.org/rss/1.0/modules/slash/",
|
||||
'atom': "http://www.w3.org/2005/Atom",
|
||||
'content': "http://purl.org/rss/1.0/modules/content/",
|
||||
'media': "http://search.yahoo.com/mrss/",
|
||||
}
|
||||
|
||||
|
||||
#~ import zlib
|
||||
#~
|
||||
#~ READ_BLOCK_SIZE = 1024 * 8
|
||||
#~ def decompress_stream(fileobj):
|
||||
#~ result = StringIO()
|
||||
#~
|
||||
#~ d = zlib.decompressobj(16 + zlib.MAX_WBITS)
|
||||
#~ for chunk in iter(partial(response.raw.read, READ_BLOCK_SIZE), ''):
|
||||
#~ result.write(d.decompress(chunk))
|
||||
#~
|
||||
#~ result.seek(0)
|
||||
#~ return result
|
||||
|
||||
|
||||
#~ parser = etree.XMLParser(remove_blank_text=True, ns_clean=True)
|
||||
#~ tree = etree.parse(metadata, parser)
|
||||
#~ root = tree.getroot()
|
||||
|
||||
from email.utils import parsedate_tz, mktime_tz
|
||||
|
||||
class feed_reader:
|
||||
"""parse a list of feeds and return details as dictionary data"""
|
||||
#create the html cleaner, this is to clean out unwanted html tags in the description text
|
||||
html_cleaner = Cleaner()
|
||||
html_cleaner.javascript = True
|
||||
html_cleaner.style = True
|
||||
html_cleaner.remove_tags = ['script', 'iframe', 'link', 'style']
|
||||
html_cleaner.remove_tags = ['script', 'iframe', 'link', 'style', 'img']
|
||||
|
||||
filter_by_date = datetime.datetime.now() - datetime.timedelta(days=int(1.5*365)) # 1 and a half years ago
|
||||
#html_cleaner.allow_tags = ['script', 'iframe', 'link', 'style']
|
||||
#html_cleaner.kill_tags = ['script', 'iframe', 'link', 'style']
|
||||
|
||||
|
||||
html_img_cleaner = Cleaner(allow_tags=['img'], remove_unknown_tags=False)
|
||||
html_img_cleaner.allow_tags = ['img']
|
||||
|
||||
html_parser = lxml.etree.HTMLParser()
|
||||
xml_parser = lxml.etree.XMLParser(remove_blank_text=True, ns_clean=True, encoding='utf-8')
|
||||
|
||||
def __init__(self, feed_details, timeout=5):
|
||||
self.results = {}
|
||||
parser = lxml.etree.XMLParser(remove_blank_text=True, ns_clean=True, encoding='utf-8')
|
||||
for feed_info in feed_details:
|
||||
self.url = feed_info.get('url')
|
||||
self.author = feed_info.get('author')
|
||||
|
@ -73,27 +60,67 @@ class feed_reader:
|
|||
response = requests.get(feed_info.get('url'), stream=True, timeout=timeout)
|
||||
if response.headers.get('content-encoding') == 'gzip':
|
||||
response.raw.read = functools.partial(response.raw.read, decode_content=True)
|
||||
self.feed = lxml.etree.parse(response.raw, parser)
|
||||
try:
|
||||
self.feed = lxml.etree.parse(response.raw, self.xml_parser)
|
||||
except:
|
||||
continue
|
||||
else:
|
||||
fp = open(feed_info.get('url'), 'r')
|
||||
self.feed = lxml.etree.parse(fp, parser)
|
||||
with open(os.path.abspath(feed_info.get('url')), 'r') as file_stream:
|
||||
try:
|
||||
self.feed = lxml.etree.parse(file_stream, self.xml_parser)
|
||||
except:
|
||||
continue
|
||||
|
||||
self.feed = self.feed.getroot()
|
||||
|
||||
# rss feed defaults
|
||||
self.channel_image = self.fetch_node_text(self.feed, 'channel/image/url', '')
|
||||
|
||||
self.parse_feed()
|
||||
|
||||
def convert_rfc822_to_datetime(self, rfcdate):
    """Convert an RFC 822 date string (as used by rss pubDate) to a datetime.

    rfcdate -- the date text taken from the feed; may be empty or None.

    Returns a naive datetime.datetime expressed in UTC, or None when
    rfcdate is empty, None, or cannot be parsed.
    """
    # A missing date can arrive as '' or None; neither can be parsed, and
    # len(None) would raise, so bail out early on any falsy value.
    if not rfcdate:
        return None

    # parsedate_tz signals malformed input by returning None, not by raising.
    parsed_rfcdate = parsedate_tz(rfcdate)
    if not parsed_rfcdate:
        return None

    # mktime_tz gives a UTC timestamp; build a UTC datetime from it and then
    # strip tzinfo so the value is a plain naive datetime.
    return datetime.datetime.fromtimestamp(
        mktime_tz(parsed_rfcdate), pytz.utc).replace(tzinfo=None)
|
||||
|
||||
def clean_up_text(self, text):
    """Strip dirty tags such as <script> from text so it cannot break the site.

    text -- html fragment, typically an item description from a feed.

    Returns the cleaned html produced by self.html_cleaner.  Empty, None or
    whitespace-only input yields an empty string without calling the cleaner.
    """
    # The html parser behind clean_html rejects empty documents, and a feed
    # item with no description reaches us as '' -- short circuit that case.
    if not text or not text.strip():
        return u''
    return self.html_cleaner.clean_html(text)
|
||||
|
||||
def fetch_image_from_node_text(self, text):
    """Return the src of the first <img> that has one in an html fragment.

    text -- html markup, e.g. the body of a description or content:encoded
            element; may be empty or None.

    Returns the image url, or None when text is empty, cannot be parsed, or
    holds no image with a src attribute.
    """
    # Nothing to parse; the parser would raise on an empty document.
    if not text or not text.strip():
        return None

    try:
        description = lxml.etree.parse(StringIO.StringIO(text), self.html_parser)
    except lxml.etree.LxmlError:
        # Broken markup in one item should not abort the whole feed.
        return None

    # A parse that produced no root element has nothing to search.
    if description.getroot() is None:
        return None

    # Select the attribute directly so images lacking a src are skipped
    # instead of ending the search with None.
    sources = description.xpath('.//img/@src')
    if sources:
        return sources[0]
    return None
|
||||
|
||||
def fetch_image(self, node):
    """Try and get an image for a feed item, using various fall back methods.

    node -- the item element being parsed.

    Sources are tried in this order and the first non-empty url wins:
      1. the url attribute of a media:thumbnail child
      2. the first <img> inside content:encoded
      3. the first <img> inside description
      4. the channel image held in self.channel_image
    """
    thumbnails = node.xpath('media:thumbnail', namespaces=namespaces)
    if thumbnails:
        url = thumbnails[0].get('url', '')
        # A thumbnail element without a url is no use, so keep looking
        # rather than returning an empty string.
        if url:
            return url

    # no usable media:thumbnail so lets try and grab an image from content:encoded
    encoded = node.xpath('content:encoded', namespaces=namespaces)
    if encoded and encoded[0].text:
        image = self.fetch_image_from_node_text(encoded[0].text)
        if image:
            return image

    # final attempt at getting an image from the item using description
    description = self.fetch_node_text(node, 'description')
    if description:
        image = self.fetch_image_from_node_text(description)
        if image:
            return image

    # no image so lets fall back to the channel image if it exists
    return self.channel_image
|
||||
|
||||
def fetch_node_text(self, node, name, default=''):
|
||||
"""fetch the text from the node we are given, we are working in unicode
|
||||
so decode byte strings to unicode"""
|
||||
|
@ -106,10 +133,10 @@ class feed_reader:
|
|||
else:
|
||||
return default
|
||||
|
||||
def fetch_node_attribute(self, node, name, attribs, default):
    """Return an attribute of the first child element matching name.

    node    -- element to search beneath
    name    -- child path, evaluated as the xpath './<name>'
    attribs -- name of the attribute to read from the matched element
    default -- value returned when no child matches

    If a child matches but lacks the attribute an empty string is returned.
    """
    result = node.xpath('./%s' % name)
    if not result:
        return default
    # xpath() hands back a list of matches, so the attribute has to be read
    # from an element in that list rather than from the list itself.
    return result[0].get(attribs, '')
|
||||
|
||||
|
@ -117,7 +144,7 @@ class feed_reader:
|
|||
"""extract the authors name from the author text node"""
|
||||
return author.split('(')[-1].strip(')')
|
||||
|
||||
def filter(self, node, tags=None):
|
||||
def filter_by_tags(self, node, tags=None):
|
||||
"""filter the feed out by category tag, if no tags assume its pre filtered"""
|
||||
if self.tags is None:
|
||||
return True
|
||||
|
@ -130,14 +157,13 @@ class feed_reader:
|
|||
"""Parse the items in the feed, filter out bad data and put in defaults"""
|
||||
for item in self.feed.xpath('.//item', namespaces=namespaces):
|
||||
date = self.convert_rfc822_to_datetime(self.fetch_node_text(item, 'pubDate'))
|
||||
if date > self.filter_by_date and self.filter(item):
|
||||
self.filter(item)
|
||||
if date > self.filter_by_date and self.filter_by_tags(item):
|
||||
self.results[date] = {
|
||||
'title': self.fetch_node_text(item, 'title'),
|
||||
'date': date,
|
||||
'url': self.fetch_node_text(item, 'link'),
|
||||
'author': self.format_author(self.fetch_node_text(item, 'author', self.author)),
|
||||
'image': self.fetch_node_text(item, 'image'),
|
||||
'image': self.fetch_image(item),
|
||||
'description': self.clean_up_text(self.fetch_node_text(item, 'description'))}
|
||||
|
||||
def __iter__(self):
|
||||
|
@ -145,13 +171,11 @@ class feed_reader:
|
|||
for order in sorted(self.results.keys(), reverse=True):
|
||||
#print str(self.results[order]['date']) + ' - ' + self.results[order]['author'] + ' - ' + self.results[order]['title']
|
||||
yield self.results[order]
|
||||
|
||||
rss_feeds = [
|
||||
{'author':'Simon Ridley', 'url': 'http://waistcoatforensicator.blogspot.com/feeds/posts/default?alt=rss'},
|
||||
{'author':'Mathew Beddow', 'tags': ['tech'], 'url': 'http://www.matthewbeddow.co.uk/?feed=rss2'},
|
||||
{'author':'Mike McRoberts', 'url': 'http://thearduinoguy.org/?feed=rss2'}]
|
||||
|
||||
#~ import .constants
|
||||
test = feed_reader(rss_feeds)
|
||||
for item in test:
|
||||
pass
|
||||
if __name__ == "__main__":
|
||||
rss_tests = [
|
||||
{'author': 'Mike McRoberts', 'url': './rss_invalid.xml'},
|
||||
{'author': 'Mike McRoberts', 'url': './rss_no_tags.xml'}]
|
||||
|
||||
test = feed_reader(rss_tests)
|
||||
|
||||
|
|
Before Width: | Height: | Size: 94 KiB After Width: | Height: | Size: 94 KiB |
Before Width: | Height: | Size: 93 KiB After Width: | Height: | Size: 93 KiB |
Before Width: | Height: | Size: 53 KiB After Width: | Height: | Size: 53 KiB |
Before Width: | Height: | Size: 1.9 KiB After Width: | Height: | Size: 1.9 KiB |
Before Width: | Height: | Size: 100 KiB After Width: | Height: | Size: 100 KiB |
Before Width: | Height: | Size: 1.3 KiB After Width: | Height: | Size: 1.3 KiB |
Before Width: | Height: | Size: 14 KiB After Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 8.7 KiB After Width: | Height: | Size: 8.7 KiB |
|
@ -3,43 +3,36 @@ from scaffold.web import www
|
|||
|
||||
class control(www.default.html_ui):
|
||||
"""Image cycle widgets"""
|
||||
view=[]
|
||||
buttons=[]
|
||||
content=[]
|
||||
count=0
|
||||
offset=60
|
||||
height=300
|
||||
width=400
|
||||
view = []
|
||||
buttons = []
|
||||
content = []
|
||||
count = 0
|
||||
|
||||
with open(os.path.abspath('./widgets/banner_slider.js')) as fp:
|
||||
script = [fp.read()]
|
||||
|
||||
#~ def javascript(self):
|
||||
#~ return fp.read()
|
||||
#~ self.script.append()
|
||||
|
||||
def create(self):
    """Begin a new banner; simply delegates to reset()."""
    self.reset()
|
||||
|
||||
def reset(self):
    """Give this widget fresh, empty view, buttons and content lists."""
    # Three separate list objects, bound on the instance.
    self.view, self.buttons, self.content = [], [], []
|
||||
|
||||
def append(self, image, link, title, intro=''):
    """Add one slide to the banner.

    image -- url of the slide image
    link  -- target url; when empty the slide is not wrapped in an anchor
    title -- heading shown in the slide's content box
    intro -- optional text shown beneath the title
    """
    if link:
        template = u'<a href="%s" ><img src="%s" /><div class="content">%s<br />%s</div></a>'
        values = (link, image, title, intro)
    else:
        template = u'<img src="%s" /><div class="content">%s<br />%s</div>'
        values = (image, title, intro)
    self.content.append(template % values)
|
||||
|
||||
def render(self):
|
||||
#~ self.script.append(self.javascript())
|
||||
self.count+=1
|
||||
self.count += 0
|
||||
htm = u'<div class="banner-slide" ng-app="myApp" ng-controller="sliderController">'
|
||||
htm += u'<ul style="%s" ng-switch on="currentSlide" ng-init="length=%d;">' % (self.height, len(self.content))
|
||||
htm += u'<ul ng-switch on="currentSlide" ng-init="length=%d;">' % (len(self.content))
|
||||
count = 0
|
||||
for item in self.content:
|
||||
htm += u'<li class="slide" ng-switch-when="%s">%s</li>' % (count, item)
|
||||
count += 1
|
||||
#htm += '''<li class="slide" ng-repeat="slide in slides" ng-hide="!isCurrentSlideIndex($index)" ng-show="isCurrentSlideIndex($index)"><a href="{{slide.link}}" ><img src="{{slide.src}}" /><div class="content">{{slide.title}}<br />{{slide.description}}</div></a></li>'''
|
||||
htm += u'<li style="clear:both;"></li></ul>'
|
||||
htm += u'<div ng-click="prev()" title="Previous" role="button" class="slide-button left"><</div>'
|
||||
htm += u'<div ng-click="next()" title="Next" role="button" class="slide-button right">></div>'
|
||||
|
|
|
@ -8,23 +8,24 @@ class control(www.default.html_ui):
|
|||
|
||||
def append(self, title, author, date, link, image, description=''):
    """Queue one tile for rendering.

    The arguments are stored together as a dict on self.data, keyed by
    'title', 'author', 'date', 'link', 'image' and 'description'.
    """
    tile = dict(
        title=title,
        author=author,
        date=date,
        link=link,
        image=image,
        description=description)
    self.data.append(tile)
|
||||
|
||||
def render(self):
    """Return the html for every tile held in self.data as one unicode string.

    Each tile is a <div class="tile"> containing an image box, a header with
    the linked title and author, and the description.  When a tile has an
    image it is applied as a css background on the image box; otherwise the
    box is left empty.  With no tiles an empty string is returned.
    """
    # NOTE(review): values are interpolated into the markup unescaped; if
    # tiles are fed from external sources they should be escaped upstream.
    parts = []
    for project in self.data:
        parts.append(u'<div class="tile">')
        image = project.get('image')
        if image:
            parts.append(u'<div class="tile-img" style="background:center no-repeat url(%s);background-size:contain;"></div>' % image)
        else:
            parts.append(u'<div class="tile-img"></div>')
        parts.append(u'<header class="tile-content"><h2><a href="%s">%s</a> By %s</h2></header>' % (
            project.get('link'), project.get('title'), project.get('author')))
        parts.append(u'<div class="tile-content"><p>%s</p></div>' % (project.get('description')))
        parts.append(u'</div>')
    # Collect fragments and join once instead of growing a string with +=.
    return u''.join(parts)
|
||||
|
|