More fixes to the feed parser, implemented ordering again

This commit is contained in:
Oliver Marks 2018-09-20 20:53:48 +01:00
parent b3cd4aab0b
commit 3b9b943db1
3 changed files with 95 additions and 60 deletions

View File

@@ -1,26 +1,27 @@
 # -*- coding: utf-8 -*-
 import os
-import tempfile
 import requests
 import logging
+from io import BytesIO
 from time import mktime
 from datetime import datetime
-from django.conf import settings
-from django.core.files import File
-from stdimage.utils import render_variations
-from mhackspace.feeds.reader import fetch_feeds
-from mhackspace.feeds.models import Feed, Article, image_variations
+from django.core.files import File
+from mhackspace.feeds.reader import fetch_feeds
+from mhackspace.feeds.models import Feed, Article

 logger = logging.getLogger(__name__)


 def import_feeds(feed=False):
     remove_old_articles()
-    articles = []
-    for article in fetch_feeds(get_active_feeds(feed)):
+    articles = fetch_feeds(get_active_feeds(feed))
+    article_objects = []
+    # for author in articles:
+    for article in articles:
         date = datetime.fromtimestamp(mktime(article["date"]))
-        articles.append(
+        article_objects.append(
             Article(
                 url=article["url"],
                 feed=Feed.objects.get(pk=article["feed"]),
@@ -30,7 +31,7 @@ def import_feeds(feed=False):
                 date=date,
             )
         )
-    articles = Article.objects.bulk_create(articles)
+    articles = Article.objects.bulk_create(article_objects)
     download_remote_images()
     return articles
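
The refactor above stops reusing one list for both the fetched dicts and the unsaved model instances, accumulating Article objects separately so a single bulk_create() can insert them in one query. A minimal sketch of that pattern, assuming the app's Article model; create_articles, fetched, and the field subset are hypothetical:

    # Sketch only: accumulate unsaved instances, then insert them in one batch.
    from mhackspace.feeds.models import Article

    def create_articles(fetched):
        # Hypothetical minimal fields; the real code also sets feed, date, etc.
        article_objects = [Article(url=item["url"], title=item["title"]) for item in fetched]
        # bulk_create issues a single INSERT for the whole batch instead of
        # one query per article.
        return Article.objects.bulk_create(article_objects)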
@@ -43,7 +44,6 @@ def remove_old_articles():
 def download_remote_images():
     for article in Article.objects.all():
-        print(article.original_image)
         if not article.original_image:
             continue
         try:
@@ -57,21 +57,13 @@ def download_remote_images():
             return

         try:
-            tmpfile = tempfile.TemporaryFile(mode='w+b')
-            tmpfile.write(result.content)
             article.image.save(
                 os.path.basename(article.original_image),
-                File(tmpfile),
+                File(BytesIO(result.content)),
             )
-            file_path = f'{settings.MEDIA_ROOT}/{article.image.file}'
-            render_variations(file_path, image_variations, replace=True)
             article.save()
         except Exception as e:
             logger.exception(result)
-        finally:
-            tmpfile.close()


 def get_active_feeds(feed=False):
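
The change above swaps the TemporaryFile write/close bookkeeping for an in-memory buffer: requests already holds the whole payload in result.content, and django.core.files.File accepts any file-like object, so a BytesIO is enough for the image field's save(). A standalone sketch of the pattern; save_remote_image is a hypothetical helper name:

    import os
    from io import BytesIO

    import requests
    from django.core.files import File

    def save_remote_image(article, url):
        # Fetch the image and save it straight from memory; no temp file,
        # no finally/close cleanup.
        result = requests.get(url, timeout=10)
        result.raise_for_status()
        article.image.save(os.path.basename(url), File(BytesIO(result.content)))
        article.save()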

View File

@@ -1,20 +1,34 @@
 import lxml
 import feedparser
+import datetime
 from io import StringIO
+from operator import itemgetter
 from lxml.html.clean import Cleaner
 from django.utils.html import escape


-urls = [
-    "https://feeds.feedburner.com/projects-jl",
-    "https://hackaday.com/tag/emf-camp-2018/feed/",
-    "https://maidstone-hackspace.org.uk/blog/rss/",
-    "http://webboggles.com/feed/",
-    "https://blog.digitaloctave.com/rss.xml",
-]
-html_parser = lxml.etree.HTMLParser()
+filter_by_date_expire = datetime.datetime.now() - datetime.timedelta(
+    days=int(1.5 * 365)
+)
+namespaces = {}
+
+
+def filter_by_tags(self, node, tags=None):
+    """filter the feed by category tag; if no tags, assume it's pre-filtered"""
+    if self.tags is None:
+        return True
+    for category in node.xpath("./category", namespaces=namespaces):
+        if category.text.lower() in self.tags:
+            return True
+    return False
+
+
+def filter_by_date(self, date):
+    """filter the feed by date"""
+    if self.enable_date_filter is False:
+        return True
+    if date > self.filter_by_date_expire:
+        return True
+    return False


 def parse_content(content):
@@ -29,25 +43,13 @@ def parse_content(content):

 def fetch_image_from_node_text(text):
+    html_parser = lxml.etree.HTMLParser()
     description = lxml.etree.parse(StringIO(text), html_parser)
     for image in description.xpath(".//img"):
         return image.get("src")
     return None


-def fetch_node_text(node, name, default=u""):
-    """fetch the text from the node we are given, we are working in unicode
-    so decode byte strings to unicode"""
-    result = node.xpath("./%s" % name)
-    if result is None or len(result) is 0:
-        return default
-    if type(result[-1].text) is str:
-        return result[-1].text.encode("utf-8")
-    else:
-        return result[-1].text


 def fetch_image(post, node, namespaces):
     """Try and get an image from an item in the feed, use various fall back methods"""
     if hasattr(post, "media_thumbnail"):

@@ -62,9 +64,7 @@ def fetch_image(post, node, namespaces):
             return image

     # final attempt at getting an image from the item using description
-    result = fetch_node_text(node, "description")
-    if result:
-        image = fetch_image_from_node_text(result)
+    image = fetch_image_from_node_text(post.description)
     if image:
         return image
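
The simplified fallback above reads the entry body straight from feedparser's post.description and scans it for the first <img>, which is why fetch_node_text could be dropped. A quick standalone check of that helper, using the fetch_image_from_node_text defined in this file; the sample HTML is illustrative:

    html = '<p>hello <img src="http://example.com/cat.png"/> world</p>'
    print(fetch_image_from_node_text(html))  # -> http://example.com/cat.png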
@@ -73,10 +73,12 @@

 def fetch_feeds(feeds):
-    articles = []
+    articles = {}
+    print(feeds)
     for feed in feeds:
         url = feed.get("url")
+        author = feed.get("author")
         parsed = feedparser.parse(url)
         namespaces = {}
         if hasattr(parsed, "namespaces"):

@@ -84,10 +86,13 @@ def fetch_feeds(feeds):
         feed_image = ""
         if hasattr(parsed.feed, "image"):
             feed_image = parsed.feed.image.get("href")
+        print(author)
         for post in parsed.entries:
             root_node = parse_content(post.description)
             image = fetch_image(post, root_node, namespaces) or feed_image
-            yield {
-                "url": post.link,
-                "feed": feed.get("id"),
-                "title": post.title,
+            articles.setdefault(author, []).append(
+                {
+                    "url": post.link,
+                    "feed": feed.get("id"),
+                    "title": post.title,
@@ -96,4 +101,37 @@ def fetch_feeds(feeds):
-                "date": post.published_parsed,
-                "image": image,
-            }
-    return articles
+                    "date": post.published_parsed,
+                    "image": image,
+                }
+            )
+    # order each author's articles by date
+    for author in articles.keys():
+        articles[author] = sorted(
+            articles[author], key=itemgetter("date"), reverse=True
+        )
+    return [f for f in alternate_dict_and_sort_by_list_item_key(articles)]
+    # return articles
+
+
+def alternate_dict_and_sort_by_list_item_key(dict_of_lists, sort_key="date"):
+    """Take a dictionary of ordered lists, step through the lists one row at a
+    time, sort the current row from each list, and yield the results.
+
+    This keeps date ordering while stepping through the blog entries, which is
+    fairer on people who do not blog often."""
+    longest_list_length = max(
+        [len(dict_of_lists[d]) for d in dict_of_lists.keys()] + [0]
+    )
+    # order each feed by date, newest at the end of the list so it can be popped
+    for author in dict_of_lists:
+        dict_of_lists[author].sort(key=itemgetter("date"), reverse=False)
+    # now iterate through the author lists, popping the newest remaining entry
+    # from each and ordering that row by date
+    for i in range(0, longest_list_length):
+        # take one value from each author's list and sort the row by sort_key
+        feed_row = [d.pop() for d in dict_of_lists.values() if d]
+        results = sorted(feed_row, key=itemgetter(sort_key), reverse=True)
+        for item in results:
+            yield item
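
The helper above interleaves authors round-robin, taking each author's newest remaining post per round and sorting every round by date, so prolific feeds cannot crowd out quiet ones. A toy run of the function as committed, with integers standing in for the real published_parsed dates (any comparable value works with itemgetter):

    articles = {
        "alice": [{"date": 3, "title": "a3"}, {"date": 1, "title": "a1"}],
        "bob": [{"date": 2, "title": "b2"}],
    }
    print([a["title"] for a in alternate_dict_and_sort_by_list_item_key(articles)])
    # ['a3', 'b2', 'a1']: round one takes each author's newest post and sorts
    # that row by date; round two continues with alice's older post.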

View File

@@ -4,6 +4,11 @@ from mhackspace.feeds.models import Article

 register = template.Library()


-@register.inclusion_tag('feeds/list.html')
+@register.inclusion_tag("feeds/list.html")
 def show_feeds():
-    return {'articles': Article.objects.select_related('feed').filter(displayed=True, feed__enabled=True)}
+    return {
+        "articles": Article.objects.select_related("feed").filter(
+            displayed=True, feed__enabled=True
+        )
+    }