From 3b9b943db1efec2e9871d3e98084b446944cd19a Mon Sep 17 00:00:00 2001
From: Oliver Marks
Date: Thu, 20 Sep 2018 20:53:48 +0100
Subject: [PATCH] More fixes to the feed parser; reimplement article ordering

---
 mhackspace/feeds/helper.py                  | 32 +++---
 mhackspace/feeds/reader.py                  | 114 +++++++++++++-------
 mhackspace/feeds/templatetags/feed_views.py |  9 +-
 3 files changed, 95 insertions(+), 60 deletions(-)

diff --git a/mhackspace/feeds/helper.py b/mhackspace/feeds/helper.py
index 84f9f4d..cf2cb81 100644
--- a/mhackspace/feeds/helper.py
+++ b/mhackspace/feeds/helper.py
@@ -1,26 +1,27 @@
 # -*- coding: utf-8 -*-
 import os
-import tempfile
 import requests
 import logging
+from io import BytesIO
 from time import mktime
 from datetime import datetime
-from django.conf import settings
-from django.core.files import File
-from stdimage.utils import render_variations
-from mhackspace.feeds.reader import fetch_feeds
-from mhackspace.feeds.models import Feed, Article, image_variations
+from django.core.files import File
+
+from mhackspace.feeds.reader import fetch_feeds
+from mhackspace.feeds.models import Feed, Article
 
 logger = logging.getLogger(__name__)
 
 
 def import_feeds(feed=False):
     remove_old_articles()
-    articles = []
-    for article in fetch_feeds(get_active_feeds(feed)):
+    articles = fetch_feeds(get_active_feeds(feed))
+    article_objects = []
+    for article in articles:
         date = datetime.fromtimestamp(mktime(article["date"]))
-        articles.append(
+        article_objects.append(
             Article(
                 url=article["url"],
                 feed=Feed.objects.get(pk=article["feed"]),
@@ -30,7 +31,7 @@ def import_feeds(feed=False):
                 date=date,
             )
         )
-    articles = Article.objects.bulk_create(articles)
+    articles = Article.objects.bulk_create(article_objects)
     download_remote_images()
     return articles
 
@@ -43,7 +44,6 @@ def remove_old_articles():
 
 def download_remote_images():
     for article in Article.objects.all():
-        print(article.original_image)
         if not article.original_image:
             continue
         try:
@@ -57,21 +57,13 @@ def download_remote_images():
             return
 
         try:
-            tmpfile = tempfile.TemporaryFile(mode='w+b')
-            tmpfile.write(result.content)
-
             article.image.save(
                 os.path.basename(article.original_image),
-                File(tmpfile),
+                File(BytesIO(result.content)),
             )
-
-            file_path = f'{settings.MEDIA_ROOT}/{article.image.file}'
-            render_variations(file_path, image_variations, replace=True)
             article.save()
         except Exception as e:
             logger.exception(result)
-        finally:
-            tmpfile.close()
 
 
 def get_active_feeds(feed=False):
diff --git a/mhackspace/feeds/reader.py b/mhackspace/feeds/reader.py
index fedd6cf..fc388ed 100644
--- a/mhackspace/feeds/reader.py
+++ b/mhackspace/feeds/reader.py
@@ -1,20 +1,34 @@
 import lxml
 import feedparser
+import datetime
 from io import StringIO
+from operator import itemgetter
 from lxml.html.clean import Cleaner
 from django.utils.html import escape
 
+filter_by_date_expire = datetime.datetime.now() - datetime.timedelta(
+    days=int(1.5 * 365)
+)
 
-namespaces = {}
-urls = [
-    "https://feeds.feedburner.com/projects-jl",
-    "https://hackaday.com/tag/emf-camp-2018/feed/",
-    "https://maidstone-hackspace.org.uk/blog/rss/",
-    "http://webboggles.com/feed/",
-    "https://blog.digitaloctave.com/rss.xml",
-]
-html_parser = lxml.etree.HTMLParser()
+
+def filter_by_tags(node, tags=None):
+    """Filter a feed item by its category tags; with no tags, assume it is pre-filtered."""
+    if tags is None:
+        return True
+    for category in node.xpath("./category"):
+        if category.text.lower() in tags:
+            return True
+    return False
+
+
+def filter_by_date(date, enable_date_filter=True):
+    """Filter out feed entries older than the filter_by_date_expire cut-off."""
+    if enable_date_filter is False:
+        return True
+    if date > filter_by_date_expire:
+        return True
+    return False
 
 
 def parse_content(content):
@@ -29,25 +43,13 @@ def parse_content(content):
 
 
 def fetch_image_from_node_text(text):
+    html_parser = lxml.etree.HTMLParser()
     description = lxml.etree.parse(StringIO(text), html_parser)
     for image in description.xpath(".//img"):
         return image.get("src")
     return None
 
 
-def fetch_node_text(node, name, default=u""):
-    """fetch the text from the node we are given, we are working in unicode
-    so decode byte strings to unicode"""
-    result = node.xpath("./%s" % name)
-    if result is None or len(result) is 0:
-        return default
-
-    if type(result[-1].text) is str:
-        return result[-1].text.encode("utf-8")
-    else:
-        return result[-1].text
-
-
 def fetch_image(post, node, namespaces):
     """Try and get an image from an item in the feed, use various fall back methods"""
     if hasattr(post, "media_thumbnail"):
@@ -62,21 +64,21 @@ def fetch_image(post, node, namespaces):
         return image
 
     # final attempt at getting an image from the item using description
-    result = fetch_node_text(node, "description")
-    if result:
-        image = fetch_image_from_node_text(result)
-        if image:
-            return image
+    image = fetch_image_from_node_text(post.description)
+    if image:
+        return image
 
     # no image so lets fall back to the channel image if it exists
     return None
 
 
 def fetch_feeds(feeds):
-    articles = []
+    articles = {}
     for feed in feeds:
         url = feed.get("url")
+        author = feed.get("author")
         parsed = feedparser.parse(url)
         namespaces = {}
         if hasattr(parsed, "namespaces"):
@@ -84,16 +86,52 @@ def fetch_feeds(feeds):
         feed_image = ""
         if hasattr(parsed.feed, "image"):
             feed_image = parsed.feed.image.get("href")
         for post in parsed.entries:
             root_node = parse_content(post.description)
             image = fetch_image(post, root_node, namespaces) or feed_image
-            yield {
-                "url": post.link,
-                "feed": feed.get("id"),
-                "title": post.title,
-                "original_image": image,
-                "description": post.description,
-                "date": post.published_parsed,
-                "image": image,
-            }
-    return articles
+
+            articles.setdefault(author, []).append(
+                {
+                    "url": post.link,
+                    "feed": feed.get("id"),
+                    "title": post.title,
+                    "original_image": image,
+                    "description": post.description,
+                    "date": post.published_parsed,
+                    "image": image,
+                }
+            )
+
+    # order each author's articles by date, newest first
+    for author in articles.keys():
+        articles[author] = sorted(
+            articles[author], key=itemgetter("date"), reverse=True
+        )
+    return list(alternate_dict_and_sort_by_list_item_key(articles))
+
+
+def alternate_dict_and_sort_by_list_item_key(dict_of_lists, sort_key="date"):
+    """Take a dictionary of date-ordered lists and yield one item from each
+    list per round, with each round itself sorted by sort_key.
+
+    This keeps the articles in rough date order while stepping through the
+    authors in turn, so people who do not blog often are not drowned out
+    by prolific bloggers.
""" + longest_list_length = max( + [len(dict_of_lists[d]) for d in dict_of_lists.keys()] + [0] + ) + + # order each feed by date, newest date at the end of the list so it can be poped + for author in dict_of_lists: + dict_of_lists[author].sort(key=itemgetter("date"), reverse=False) + + # now iterate through author lists, popping the first elements and order the current item + # from each list by date + for i in range(0, longest_list_length): + # get first value from each key, and order the list by sort key which is date by default + feed_row = [d.pop() for d in dict_of_lists.values() if d] + results = sorted(feed_row, key=itemgetter(sort_key), reverse=True) + for item in results: + yield item diff --git a/mhackspace/feeds/templatetags/feed_views.py b/mhackspace/feeds/templatetags/feed_views.py index ca12647..51650c8 100644 --- a/mhackspace/feeds/templatetags/feed_views.py +++ b/mhackspace/feeds/templatetags/feed_views.py @@ -4,6 +4,11 @@ from mhackspace.feeds.models import Article register = template.Library() -@register.inclusion_tag('feeds/list.html') + +@register.inclusion_tag("feeds/list.html") def show_feeds(): - return {'articles': Article.objects.select_related('feed').filter(displayed=True, feed__enabled=True)} + return { + "articles": Article.objects.select_related("feed").filter( + displayed=True, feed__enabled=True + ) + }