From 3b9b943db1efec2e9871d3e98084b446944cd19a Mon Sep 17 00:00:00 2001
From: Oliver Marks
Date: Thu, 20 Sep 2018 20:53:48 +0100
Subject: [PATCH] More fixes to the feed parser; reimplement article ordering

---
 mhackspace/feeds/helper.py                  | 32 +++---
 mhackspace/feeds/reader.py                  | 114 +++++++++++++-------
 mhackspace/feeds/templatetags/feed_views.py |  9 +-
 3 files changed, 95 insertions(+), 60 deletions(-)

diff --git a/mhackspace/feeds/helper.py b/mhackspace/feeds/helper.py
index 84f9f4d..cf2cb81 100644
--- a/mhackspace/feeds/helper.py
+++ b/mhackspace/feeds/helper.py
@@ -1,26 +1,27 @@
 # -*- coding: utf-8 -*-
 import os
-import tempfile
 import requests
 import logging
+from io import BytesIO
 from time import mktime
 from datetime import datetime
-from django.conf import settings
-from django.core.files import File
-from stdimage.utils import render_variations
-from mhackspace.feeds.reader import fetch_feeds
-from mhackspace.feeds.models import Feed, Article, image_variations
+from django.core.files import File
+
+from mhackspace.feeds.reader import fetch_feeds
+from mhackspace.feeds.models import Feed, Article
 
 logger = logging.getLogger(__name__)
 
 
 def import_feeds(feed=False):
     remove_old_articles()
-    articles = []
-    for article in fetch_feeds(get_active_feeds(feed)):
+    articles = fetch_feeds(get_active_feeds(feed))
+    article_objects = []
+    for article in articles:
         date = datetime.fromtimestamp(mktime(article["date"]))
-        articles.append(
+        article_objects.append(
             Article(
                 url=article["url"],
                 feed=Feed.objects.get(pk=article["feed"]),
@@ -30,7 +31,7 @@ def import_feeds(feed=False):
                 date=date,
             )
         )
-    articles = Article.objects.bulk_create(articles)
+    articles = Article.objects.bulk_create(article_objects)
     download_remote_images()
     return articles
 
@@ -43,7 +44,6 @@ def remove_old_articles():
 
 def download_remote_images():
     for article in Article.objects.all():
-        print(article.original_image)
         if not article.original_image:
             continue
         try:
@@ -57,21 +57,13 @@ def download_remote_images():
             return
 
         try:
-            tmpfile = tempfile.TemporaryFile(mode='w+b')
-            tmpfile.write(result.content)
-
             article.image.save(
                 os.path.basename(article.original_image),
-                File(tmpfile),
+                File(BytesIO(result.content)),
             )
-
-            file_path = f'{settings.MEDIA_ROOT}/{article.image.file}'
-            render_variations(file_path, image_variations, replace=True)
             article.save()
         except Exception as e:
             logger.exception(result)
-        finally:
-            tmpfile.close()
 
 
 def get_active_feeds(feed=False):
diff --git a/mhackspace/feeds/reader.py b/mhackspace/feeds/reader.py
index fedd6cf..fc388ed 100644
--- a/mhackspace/feeds/reader.py
+++ b/mhackspace/feeds/reader.py
@@ -1,20 +1,34 @@
 import lxml
 import feedparser
+import datetime
 from io import StringIO
+from operator import itemgetter
 from lxml.html.clean import Cleaner
 from django.utils.html import escape
 
+filter_by_date_expire = datetime.datetime.now() - datetime.timedelta(
+    days=int(1.5 * 365)
+)
 
-namespaces = {}
-urls = [
-    "https://feeds.feedburner.com/projects-jl",
-    "https://hackaday.com/tag/emf-camp-2018/feed/",
-    "https://maidstone-hackspace.org.uk/blog/rss/",
-    "http://webboggles.com/feed/",
-    "https://blog.digitaloctave.com/rss.xml",
-]
-html_parser = lxml.etree.HTMLParser()
+
+def filter_by_tags(node, tags=None):
+    """Filter a feed item by its category tags; with no tags, assume it is pre-filtered."""
+    if tags is None:
+        return True
+    for category in node.xpath("./category"):
+        if category.text.lower() in tags:
+            return True
+    return False
+
+
+def filter_by_date(date, enable_date_filter=True):
+    """Filter out feed entries older than the filter_by_date_expire cut-off."""
+    if enable_date_filter is False:
+        return True
+    if date > filter_by_date_expire:
+        return True
+    return False
 
 
 def parse_content(content):
@@ -29,25 +43,13 @@ def parse_content(content):
 
 
 def fetch_image_from_node_text(text):
+    html_parser = lxml.etree.HTMLParser()
     description = lxml.etree.parse(StringIO(text), html_parser)
     for image in description.xpath(".//img"):
         return image.get("src")
     return None
 
 
-def fetch_node_text(node, name, default=u""):
-    """fetch the text from the node we are given, we are working in unicode
-    so decode byte strings to unicode"""
-    result = node.xpath("./%s" % name)
-    if result is None or len(result) is 0:
-        return default
-
-    if type(result[-1].text) is str:
-        return result[-1].text.encode("utf-8")
-    else:
-        return result[-1].text
-
-
 def fetch_image(post, node, namespaces):
     """Try and get an image from an item in the feed, use various fall back methods"""
     if hasattr(post, "media_thumbnail"):
@@ -62,21 +64,21 @@ def fetch_image(post, node, namespaces):
         return image
 
     # final attempt at getting an image from the item using description
-    result = fetch_node_text(node, "description")
-    if result:
-        image = fetch_image_from_node_text(result)
-        if image:
-            return image
+    image = fetch_image_from_node_text(post.description)
+    if image:
+        return image
 
     # no image so lets fall back to the channel image if it exists
     return None
 
 
 def fetch_feeds(feeds):
-    articles = []
+    articles = {}
     for feed in feeds:
         url = feed.get("url")
+        author = feed.get("author")
         parsed = feedparser.parse(url)
         namespaces = {}
         if hasattr(parsed, "namespaces"):
@@ -84,16 +86,52 @@ def fetch_feeds(feeds):
         feed_image = ""
         if hasattr(parsed.feed, "image"):
             feed_image = parsed.feed.image.get("href")
         for post in parsed.entries:
             root_node = parse_content(post.description)
             image = fetch_image(post, root_node, namespaces) or feed_image
-            yield {
-                "url": post.link,
-                "feed": feed.get("id"),
-                "title": post.title,
-                "original_image": image,
-                "description": post.description,
-                "date": post.published_parsed,
-                "image": image,
-            }
-    return articles
+
+            articles.setdefault(author, []).append(
+                {
+                    "url": post.link,
+                    "feed": feed.get("id"),
+                    "title": post.title,
+                    "original_image": image,
+                    "description": post.description,
+                    "date": post.published_parsed,
+                    "image": image,
+                }
+            )
+
+    # order each author's articles by date, newest first
+    for author in articles.keys():
+        articles[author] = sorted(
+            articles[author], key=itemgetter("date"), reverse=True
+        )
+    return list(alternate_dict_and_sort_by_list_item_key(articles))
+
+
+def alternate_dict_and_sort_by_list_item_key(dict_of_lists, sort_key="date"):
+    """Take a dictionary of date-ordered lists and yield one item from each
+    list per round, with each round itself sorted by sort_key.
+
+    This keeps the articles in rough date order while stepping through the
+    authors in turn, so people who do not blog often are not drowned out
+    by prolific bloggers.
""" + longest_list_length = max( + [len(dict_of_lists[d]) for d in dict_of_lists.keys()] + [0] + ) + + # order each feed by date, newest date at the end of the list so it can be poped + for author in dict_of_lists: + dict_of_lists[author].sort(key=itemgetter("date"), reverse=False) + + # now iterate through author lists, popping the first elements and order the current item + # from each list by date + for i in range(0, longest_list_length): + # get first value from each key, and order the list by sort key which is date by default + feed_row = [d.pop() for d in dict_of_lists.values() if d] + results = sorted(feed_row, key=itemgetter(sort_key), reverse=True) + for item in results: + yield item diff --git a/mhackspace/feeds/templatetags/feed_views.py b/mhackspace/feeds/templatetags/feed_views.py index ca12647..51650c8 100644 --- a/mhackspace/feeds/templatetags/feed_views.py +++ b/mhackspace/feeds/templatetags/feed_views.py @@ -4,6 +4,11 @@ from mhackspace.feeds.models import Article register = template.Library() -@register.inclusion_tag('feeds/list.html') + +@register.inclusion_tag("feeds/list.html") def show_feeds(): - return {'articles': Article.objects.select_related('feed').filter(displayed=True, feed__enabled=True)} + return { + "articles": Article.objects.select_related("feed").filter( + displayed=True, feed__enabled=True + ) + }