More fixes to the feed parser, implemented ordering again

This commit is contained in:
Oliver Marks 2018-09-20 20:53:48 +01:00
parent b3cd4aab0b
commit 3b9b943db1
3 changed files with 95 additions and 60 deletions

View File

@ -1,26 +1,27 @@
# -*- coding: utf-8 -*-
import os
import tempfile
import requests
import logging
from io import BytesIO
from time import mktime
from datetime import datetime
from django.conf import settings
from django.core.files import File
from stdimage.utils import render_variations
from mhackspace.feeds.reader import fetch_feeds
from mhackspace.feeds.models import Feed, Article, image_variations
from django.core.files import File
from mhackspace.feeds.reader import fetch_feeds
from mhackspace.feeds.models import Feed, Article
logger = logging.getLogger(__name__)
def import_feeds(feed=False):
remove_old_articles()
articles = []
for article in fetch_feeds(get_active_feeds(feed)):
articles = fetch_feeds(get_active_feeds(feed))
article_objects = []
# for author in articles:
for article in articles:
date = datetime.fromtimestamp(mktime(article["date"]))
articles.append(
article_objects.append(
Article(
url=article["url"],
feed=Feed.objects.get(pk=article["feed"]),
@ -30,7 +31,7 @@ def import_feeds(feed=False):
date=date,
)
)
articles = Article.objects.bulk_create(articles)
articles = Article.objects.bulk_create(article_objects)
download_remote_images()
return articles
@ -43,7 +44,6 @@ def remove_old_articles():
def download_remote_images():
for article in Article.objects.all():
print(article.original_image)
if not article.original_image:
continue
try:
@ -57,21 +57,13 @@ def download_remote_images():
return
try:
tmpfile = tempfile.TemporaryFile(mode='w+b')
tmpfile.write(result.content)
article.image.save(
os.path.basename(article.original_image),
File(tmpfile),
File(BytesIO(result.content)),
)
file_path = f'{settings.MEDIA_ROOT}/{article.image.file}'
render_variations(file_path, image_variations, replace=True)
article.save()
except Exception as e:
logger.exception(result)
finally:
tmpfile.close()
def get_active_feeds(feed=False):

View File

@ -1,20 +1,34 @@
import lxml
import feedparser
import datetime
from io import StringIO
from operator import itemgetter
from lxml.html.clean import Cleaner
from django.utils.html import escape
# Cut-off for the date filter: entries older than roughly eighteen months
# (1.5 * 365 days) are considered expired.  Computed once at import time,
# so long-running processes keep the value from process start.
filter_by_date_expire = datetime.datetime.now() - datetime.timedelta(
    days=int(1.5 * 365)
)
# Module-level default namespace map for xpath lookups.
# NOTE(review): fetch_feeds appears to build its own per-feed map — confirm
# whether this module-level default is still used.
namespaces = {}
# Default list of member blog feeds to aggregate.
urls = [
    "https://feeds.feedburner.com/projects-jl",
    "https://hackaday.com/tag/emf-camp-2018/feed/",
    "https://maidstone-hackspace.org.uk/blog/rss/",
    "http://webboggles.com/feed/",
    "https://blog.digitaloctave.com/rss.xml",
]
# Shared lxml parser instance for turning feed item markup into element trees.
html_parser = lxml.etree.HTMLParser()
def filter_by_tags(self, node, tags=None):
    """Return True when the item should be kept based on its category tags.

    If ``self.tags`` is None the feed is assumed to be pre-filtered and the
    item is always kept; otherwise the item is kept only when one of its
    ``<category>`` elements matches an entry in ``self.tags``.

    NOTE(review): the ``tags`` parameter is accepted but never read —
    filtering uses ``self.tags``.  Kept for interface compatibility.
    """
    if self.tags is None:
        return True
    for category in node.xpath("./category", namespaces=namespaces):
        # An empty <category/> element has text None; skip it instead of
        # crashing on None.lower().
        if category.text and category.text.lower() in self.tags:
            return True
    return False
def filter_by_date(self, date):
    """Return True when *date* passes the expiry filter.

    Every date passes when date filtering is disabled; otherwise only dates
    newer than ``self.filter_by_date_expire`` are kept.
    """
    filtering_enabled = self.enable_date_filter is not False
    return (not filtering_enabled) or date > self.filter_by_date_expire
def parse_content(content):
@ -29,25 +43,13 @@ def parse_content(content):
def fetch_image_from_node_text(text):
    """Return the ``src`` of the first <img> in an HTML snippet, or None."""
    parser = lxml.etree.HTMLParser()
    tree = lxml.etree.parse(StringIO(text), parser)
    images = tree.xpath(".//img")
    if not images:
        return None
    return images[0].get("src")
def fetch_node_text(node, name, default=u""):
    """Return the text of the last child element matching *name*.

    Falls back to *default* when no matching element exists.  Element text
    from lxml is already a unicode ``str`` under Python 3, so it is returned
    untouched: the old ``encode("utf-8")`` branch produced ``bytes``, which
    broke callers that pass the result to ``StringIO``.  An element with no
    text still yields None, matching the original behaviour.
    """
    result = node.xpath("./%s" % name)
    # `len(result) is 0` identity-compared an int literal; emptiness is the
    # intent, and `not result` also covers the None case.
    if not result:
        return default
    return result[-1].text
def fetch_image(post, node, namespaces):
"""Try and get an image from an item in the feed, use various fall back methods"""
if hasattr(post, "media_thumbnail"):
@ -62,21 +64,21 @@ def fetch_image(post, node, namespaces):
return image
# final attempt at getting an image from the item using description
result = fetch_node_text(node, "description")
if result:
image = fetch_image_from_node_text(result)
if image:
return image
image = fetch_image_from_node_text(post.description)
if image:
return image
# no image so lets fall back to the channel image if it exists
return None
def fetch_feeds(feeds):
articles = []
articles = {}
print(feeds)
for feed in feeds:
url = feed.get("url")
author = feed.get("author")
parsed = feedparser.parse(url)
namespaces = {}
if hasattr(parsed, "namespaces"):
@ -84,16 +86,52 @@ def fetch_feeds(feeds):
feed_image = ""
if hasattr(parsed.feed, "image"):
feed_image = parsed.feed.image.get("href")
print(author)
for post in parsed.entries:
root_node = parse_content(post.description)
image = fetch_image(post, root_node, namespaces) or feed_image
yield {
"url": post.link,
"feed": feed.get("id"),
"title": post.title,
"original_image": image,
"description": post.description,
"date": post.published_parsed,
"image": image,
}
return articles
articles.setdefault(author, []).append(
{
"url": post.link,
"feed": feed.get("id"),
"title": post.title,
"original_image": image,
"description": post.description,
"date": post.published_parsed,
"image": image,
}
)
# order authors articles by date
for author in articles.keys():
articles[author] = sorted(
articles[author], key=itemgetter("date"), reverse=True
)
return [f for f in alternate_dict_and_sort_by_list_item_key(articles)]
# return articles
def alternate_dict_and_sort_by_list_item_key(dict_of_lists, sort_key="date"):
    """Interleave a dictionary of per-author article lists, newest first.

    Each author's list is ordered by *sort_key*; one item is then taken from
    every author per round, and each round is itself ordered newest first.
    This keeps overall date ordering while rotating through authors, so
    people who do not blog often still surface near the top.

    NOTE: consumes the lists in *dict_of_lists* in place (items are popped).

    :param dict_of_lists: mapping of author -> list of article dicts.
    :param sort_key: dict key to order by (defaults to "date").
    :yields: article dicts, one interleaved round at a time.
    """
    # `+ [0]` guards max() against an empty dict.
    rounds = max([len(lst) for lst in dict_of_lists.values()] + [0])
    key_getter = itemgetter(sort_key)
    # Sort ascending so list.pop() always yields the newest remaining item.
    # Fix: this previously hard-coded itemgetter("date"), ignoring sort_key.
    for lst in dict_of_lists.values():
        lst.sort(key=key_getter, reverse=False)
    for _ in range(rounds):
        # One item from every non-empty list, ordered newest first.
        row = [lst.pop() for lst in dict_of_lists.values() if lst]
        for item in sorted(row, key=key_getter, reverse=True):
            yield item

View File

@ -4,6 +4,11 @@ from mhackspace.feeds.models import Article
register = template.Library()
@register.inclusion_tag('feeds/list.html')
@register.inclusion_tag("feeds/list.html")
def show_feeds():
return {'articles': Article.objects.select_related('feed').filter(displayed=True, feed__enabled=True)}
return {
"articles": Article.objects.select_related("feed").filter(
displayed=True, feed__enabled=True
)
}