More fixes to the feed parser, implemented ordering again

This commit is contained in:
Oliver Marks 2018-09-20 20:53:48 +01:00
parent b3cd4aab0b
commit 3b9b943db1
3 changed files with 95 additions and 60 deletions

View File

@@ -1,26 +1,27 @@
 # -*- coding: utf-8 -*-
 import os
-import tempfile
 import requests
 import logging
+from io import BytesIO
 from time import mktime
 from datetime import datetime
-from django.conf import settings
-from django.core.files import File
-from stdimage.utils import render_variations
-from mhackspace.feeds.reader import fetch_feeds
-from mhackspace.feeds.models import Feed, Article, image_variations
+from django.core.files import File
+from mhackspace.feeds.reader import fetch_feeds
+from mhackspace.feeds.models import Feed, Article

 logger = logging.getLogger(__name__)


 def import_feeds(feed=False):
     remove_old_articles()
-    articles = []
-    for article in fetch_feeds(get_active_feeds(feed)):
+    articles = fetch_feeds(get_active_feeds(feed))
+    article_objects = []
+    # for author in articles:
+    for article in articles:
         date = datetime.fromtimestamp(mktime(article["date"]))
-        articles.append(
+        article_objects.append(
             Article(
                 url=article["url"],
                 feed=Feed.objects.get(pk=article["feed"]),
@@ -30,7 +31,7 @@ def import_feeds(feed=False):
                 date=date,
             )
         )
-    articles = Article.objects.bulk_create(articles)
+    articles = Article.objects.bulk_create(article_objects)
     download_remote_images()
     return articles
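
The refactor above stops reusing one list for both the fetched dicts and the unsaved model instances, accumulating Article objects separately so a single bulk_create() can insert them in one query. A minimal sketch of that pattern, assuming the app's Article model; create_articles, fetched, and the field subset are hypothetical:

    # Sketch only: accumulate unsaved instances, then insert them in one batch.
    from mhackspace.feeds.models import Article

    def create_articles(fetched):
        # Hypothetical minimal fields; the real code also sets feed, date, etc.
        article_objects = [Article(url=item["url"], title=item["title"]) for item in fetched]
        # bulk_create issues a single INSERT for the whole batch instead of
        # one query per article.
        return Article.objects.bulk_create(article_objects)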
@@ -43,7 +44,6 @@ def remove_old_articles():
 def download_remote_images():
     for article in Article.objects.all():
-        print(article.original_image)
         if not article.original_image:
             continue
         try:
@@ -57,21 +57,13 @@ def download_remote_images():
             return

         try:
-            tmpfile = tempfile.TemporaryFile(mode='w+b')
-            tmpfile.write(result.content)
             article.image.save(
                 os.path.basename(article.original_image),
-                File(tmpfile),
+                File(BytesIO(result.content)),
             )
-            file_path = f'{settings.MEDIA_ROOT}/{article.image.file}'
-            render_variations(file_path, image_variations, replace=True)
             article.save()
         except Exception as e:
             logger.exception(result)
-        finally:
-            tmpfile.close()


 def get_active_feeds(feed=False):
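
The change above swaps the TemporaryFile write/close bookkeeping for an in-memory buffer: requests already holds the whole payload in result.content, and django.core.files.File accepts any file-like object, so a BytesIO is enough for the image field's save(). A standalone sketch of the pattern; save_remote_image is a hypothetical helper name:

    import os
    from io import BytesIO

    import requests
    from django.core.files import File

    def save_remote_image(article, url):
        # Fetch the image and save it straight from memory; no temp file,
        # no finally/close cleanup.
        result = requests.get(url, timeout=10)
        result.raise_for_status()
        article.image.save(os.path.basename(url), File(BytesIO(result.content)))
        article.save()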

View File

@@ -1,20 +1,34 @@
 import lxml
 import feedparser
+import datetime
 from io import StringIO
+from operator import itemgetter
 from lxml.html.clean import Cleaner
 from django.utils.html import escape


-urls = [
-    "https://feeds.feedburner.com/projects-jl",
-    "https://hackaday.com/tag/emf-camp-2018/feed/",
-    "https://maidstone-hackspace.org.uk/blog/rss/",
-    "http://webboggles.com/feed/",
-    "https://blog.digitaloctave.com/rss.xml",
-]
-html_parser = lxml.etree.HTMLParser()
+filter_by_date_expire = datetime.datetime.now() - datetime.timedelta(
+    days=int(1.5 * 365)
+)
+namespaces = {}
+
+
+def filter_by_tags(self, node, tags=None):
+    """filter the feed by category tag; if no tags, assume it's pre-filtered"""
+    if self.tags is None:
+        return True
+    for category in node.xpath("./category", namespaces=namespaces):
+        if category.text.lower() in self.tags:
+            return True
+    return False
+
+
+def filter_by_date(self, date):
+    """filter the feed by date"""
+    if self.enable_date_filter is False:
+        return True
+    if date > self.filter_by_date_expire:
+        return True
+    return False


 def parse_content(content):
@@ -29,25 +43,13 @@ def parse_content(content):

 def fetch_image_from_node_text(text):
+    html_parser = lxml.etree.HTMLParser()
     description = lxml.etree.parse(StringIO(text), html_parser)
     for image in description.xpath(".//img"):
         return image.get("src")
     return None


-def fetch_node_text(node, name, default=u""):
-    """fetch the text from the node we are given, we are working in unicode
-    so decode byte strings to unicode"""
-    result = node.xpath("./%s" % name)
-    if result is None or len(result) is 0:
-        return default
-    if type(result[-1].text) is str:
-        return result[-1].text.encode("utf-8")
-    else:
-        return result[-1].text


 def fetch_image(post, node, namespaces):
     """Try and get an image from an item in the feed, use various fall back methods"""
     if hasattr(post, "media_thumbnail"):

@@ -62,9 +64,7 @@ def fetch_image(post, node, namespaces):
             return image

     # final attempt at getting an image from the item using description
-    result = fetch_node_text(node, "description")
-    if result:
-        image = fetch_image_from_node_text(result)
+    image = fetch_image_from_node_text(post.description)
     if image:
         return image
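
The simplified fallback above reads the entry body straight from feedparser's post.description and scans it for the first <img>, which is why fetch_node_text could be dropped. A quick standalone check of that helper, using the fetch_image_from_node_text defined in this file; the sample HTML is illustrative:

    html = '<p>hello <img src="http://example.com/cat.png"/> world</p>'
    print(fetch_image_from_node_text(html))  # -> http://example.com/cat.png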
@@ -73,10 +73,12 @@

 def fetch_feeds(feeds):
-    articles = []
+    articles = {}
+    print(feeds)
     for feed in feeds:
         url = feed.get("url")
+        author = feed.get("author")
         parsed = feedparser.parse(url)
         namespaces = {}
         if hasattr(parsed, "namespaces"):

@@ -84,10 +86,13 @@ def fetch_feeds(feeds):
         feed_image = ""
         if hasattr(parsed.feed, "image"):
             feed_image = parsed.feed.image.get("href")
+        print(author)
         for post in parsed.entries:
             root_node = parse_content(post.description)
             image = fetch_image(post, root_node, namespaces) or feed_image
-            yield {
-                "url": post.link,
-                "feed": feed.get("id"),
-                "title": post.title,
+            articles.setdefault(author, []).append(
+                {
+                    "url": post.link,
+                    "feed": feed.get("id"),
+                    "title": post.title,
@@ -96,4 +101,37 @@ def fetch_feeds(feeds):
-                "date": post.published_parsed,
-                "image": image,
-            }
-    return articles
+                    "date": post.published_parsed,
+                    "image": image,
+                }
+            )
+    # order each author's articles by date
+    for author in articles.keys():
+        articles[author] = sorted(
+            articles[author], key=itemgetter("date"), reverse=True
+        )
+    return [f for f in alternate_dict_and_sort_by_list_item_key(articles)]
+    # return articles
+
+
+def alternate_dict_and_sort_by_list_item_key(dict_of_lists, sort_key="date"):
+    """Take a dictionary of ordered lists, step through the lists one row at a
+    time, sort the current row from each list, and yield the results.
+
+    This keeps date ordering while stepping through the blog entries, which is
+    fairer on people who do not blog often."""
+    longest_list_length = max(
+        [len(dict_of_lists[d]) for d in dict_of_lists.keys()] + [0]
+    )
+    # order each feed by date, newest at the end of the list so it can be popped
+    for author in dict_of_lists:
+        dict_of_lists[author].sort(key=itemgetter("date"), reverse=False)
+    # now iterate through the author lists, popping the newest remaining entry
+    # from each and ordering that row by date
+    for i in range(0, longest_list_length):
+        # take one value from each author's list and sort the row by sort_key
+        feed_row = [d.pop() for d in dict_of_lists.values() if d]
+        results = sorted(feed_row, key=itemgetter(sort_key), reverse=True)
+        for item in results:
+            yield item
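
The helper above interleaves authors round-robin, taking each author's newest remaining post per round and sorting every round by date, so prolific feeds cannot crowd out quiet ones. A toy run of the function as committed, with integers standing in for the real published_parsed dates (any comparable value works with itemgetter):

    articles = {
        "alice": [{"date": 3, "title": "a3"}, {"date": 1, "title": "a1"}],
        "bob": [{"date": 2, "title": "b2"}],
    }
    print([a["title"] for a in alternate_dict_and_sort_by_list_item_key(articles)])
    # ['a3', 'b2', 'a1']: round one takes each author's newest post and sorts
    # that row by date; round two continues with alice's older post.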

View File

@@ -4,6 +4,11 @@ from mhackspace.feeds.models import Article

 register = template.Library()


-@register.inclusion_tag('feeds/list.html')
+@register.inclusion_tag("feeds/list.html")
 def show_feeds():
-    return {'articles': Article.objects.select_related('feed').filter(displayed=True, feed__enabled=True)}
+    return {
+        "articles": Article.objects.select_related("feed").filter(
+            displayed=True, feed__enabled=True
+        )
+    }