More fixes to the feed parser, implemented ordering again

This commit is contained in:
Oliver Marks 2018-09-20 20:53:48 +01:00
parent b3cd4aab0b
commit 3b9b943db1
3 changed files with 95 additions and 60 deletions

View File

@ -1,26 +1,27 @@
# -*- coding: utf-8 -*-
import os
import tempfile
import requests
import logging
from io import BytesIO
from time import mktime
from datetime import datetime
from django.conf import settings
from django.core.files import File
from stdimage.utils import render_variations
from mhackspace.feeds.reader import fetch_feeds
from mhackspace.feeds.models import Feed, Article, image_variations
from django.core.files import File
from mhackspace.feeds.reader import fetch_feeds
from mhackspace.feeds.models import Feed, Article
logger = logging.getLogger(__name__)
def import_feeds(feed=False):
remove_old_articles()
articles = []
for article in fetch_feeds(get_active_feeds(feed)):
articles = fetch_feeds(get_active_feeds(feed))
article_objects = []
# for author in articles:
for article in articles:
date = datetime.fromtimestamp(mktime(article["date"]))
articles.append(
article_objects.append(
Article(
url=article["url"],
feed=Feed.objects.get(pk=article["feed"]),
@ -30,7 +31,7 @@ def import_feeds(feed=False):
date=date,
)
)
articles = Article.objects.bulk_create(articles)
articles = Article.objects.bulk_create(article_objects)
download_remote_images()
return articles
@ -43,7 +44,6 @@ def remove_old_articles():
def download_remote_images():
for article in Article.objects.all():
print(article.original_image)
if not article.original_image:
continue
try:
@ -57,21 +57,13 @@ def download_remote_images():
return
try:
tmpfile = tempfile.TemporaryFile(mode='w+b')
tmpfile.write(result.content)
article.image.save(
os.path.basename(article.original_image),
File(tmpfile),
File(BytesIO(result.content)),
)
file_path = f'{settings.MEDIA_ROOT}/{article.image.file}'
render_variations(file_path, image_variations, replace=True)
article.save()
except Exception as e:
logger.exception(result)
finally:
tmpfile.close()
def get_active_feeds(feed=False):

View File

@ -1,20 +1,34 @@
import lxml
import feedparser
import datetime
from io import StringIO
from operator import itemgetter
from lxml.html.clean import Cleaner
from django.utils.html import escape
# Cut-off for the date filter: entries older than roughly eighteen months
# (1.5 * 365 days) are considered expired.  Computed once at import time,
# so long-running processes keep the value from process start.
filter_by_date_expire = datetime.datetime.now() - datetime.timedelta(
    days=int(1.5 * 365)
)
# Module-level default namespace map for xpath lookups.
# NOTE(review): fetch_feeds appears to build its own per-feed map — confirm
# whether this module-level default is still used.
namespaces = {}
# Default list of member blog feeds to aggregate.
urls = [
    "https://feeds.feedburner.com/projects-jl",
    "https://hackaday.com/tag/emf-camp-2018/feed/",
    "https://maidstone-hackspace.org.uk/blog/rss/",
    "http://webboggles.com/feed/",
    "https://blog.digitaloctave.com/rss.xml",
]
# Shared lxml parser instance for turning feed item markup into element trees.
html_parser = lxml.etree.HTMLParser()
def filter_by_tags(self, node, tags=None):
    """Return True when the item should be kept based on its category tags.

    If ``self.tags`` is None the feed is assumed to be pre-filtered and the
    item is always kept; otherwise the item is kept only when one of its
    ``<category>`` elements matches an entry in ``self.tags``.

    NOTE(review): the ``tags`` parameter is accepted but never read —
    filtering uses ``self.tags``.  Kept for interface compatibility.
    """
    if self.tags is None:
        return True
    for category in node.xpath("./category", namespaces=namespaces):
        # An empty <category/> element has text None; skip it instead of
        # crashing on None.lower().
        if category.text and category.text.lower() in self.tags:
            return True
    return False
def filter_by_date(self, date):
    """Return True when *date* passes the expiry filter.

    Every date passes when date filtering is disabled; otherwise only dates
    newer than ``self.filter_by_date_expire`` are kept.
    """
    filtering_enabled = self.enable_date_filter is not False
    return (not filtering_enabled) or date > self.filter_by_date_expire
def parse_content(content):
@ -29,25 +43,13 @@ def parse_content(content):
def fetch_image_from_node_text(text):
    """Return the ``src`` of the first <img> in an HTML snippet, or None."""
    parser = lxml.etree.HTMLParser()
    tree = lxml.etree.parse(StringIO(text), parser)
    images = tree.xpath(".//img")
    if not images:
        return None
    return images[0].get("src")
def fetch_node_text(node, name, default=u""):
    """Return the text of the last child element matching *name*.

    Falls back to *default* when no matching element exists.  Element text
    from lxml is already a unicode ``str`` under Python 3, so it is returned
    untouched: the old ``encode("utf-8")`` branch produced ``bytes``, which
    broke callers that pass the result to ``StringIO``.  An element with no
    text still yields None, matching the original behaviour.
    """
    result = node.xpath("./%s" % name)
    # `len(result) is 0` identity-compared an int literal; emptiness is the
    # intent, and `not result` also covers the None case.
    if not result:
        return default
    return result[-1].text
def fetch_image(post, node, namespaces):
"""Try and get an image from an item in the feed, use various fall back methods"""
if hasattr(post, "media_thumbnail"):
@ -62,21 +64,21 @@ def fetch_image(post, node, namespaces):
return image
# final attempt at getting an image from the item using description
result = fetch_node_text(node, "description")
if result:
image = fetch_image_from_node_text(result)
if image:
return image
image = fetch_image_from_node_text(post.description)
if image:
return image
# no image so lets fall back to the channel image if it exists
return None
def fetch_feeds(feeds):
articles = []
articles = {}
print(feeds)
for feed in feeds:
url = feed.get("url")
author = feed.get("author")
parsed = feedparser.parse(url)
namespaces = {}
if hasattr(parsed, "namespaces"):
@ -84,16 +86,52 @@ def fetch_feeds(feeds):
feed_image = ""
if hasattr(parsed.feed, "image"):
feed_image = parsed.feed.image.get("href")
print(author)
for post in parsed.entries:
root_node = parse_content(post.description)
image = fetch_image(post, root_node, namespaces) or feed_image
yield {
"url": post.link,
"feed": feed.get("id"),
"title": post.title,
"original_image": image,
"description": post.description,
"date": post.published_parsed,
"image": image,
}
return articles
articles.setdefault(author, []).append(
{
"url": post.link,
"feed": feed.get("id"),
"title": post.title,
"original_image": image,
"description": post.description,
"date": post.published_parsed,
"image": image,
}
)
# order authors articles by date
for author in articles.keys():
articles[author] = sorted(
articles[author], key=itemgetter("date"), reverse=True
)
return [f for f in alternate_dict_and_sort_by_list_item_key(articles)]
# return articles
def alternate_dict_and_sort_by_list_item_key(dict_of_lists, sort_key="date"):
    """Interleave a dictionary of per-author article lists, newest first.

    Each author's list is ordered by *sort_key*; one item is then taken from
    every author per round, and each round is itself ordered newest first.
    This keeps overall date ordering while rotating through authors, so
    people who do not blog often still surface near the top.

    NOTE: consumes the lists in *dict_of_lists* in place (items are popped).

    :param dict_of_lists: mapping of author -> list of article dicts.
    :param sort_key: dict key to order by (defaults to "date").
    :yields: article dicts, one interleaved round at a time.
    """
    # `+ [0]` guards max() against an empty dict.
    rounds = max([len(lst) for lst in dict_of_lists.values()] + [0])
    key_getter = itemgetter(sort_key)
    # Sort ascending so list.pop() always yields the newest remaining item.
    # Fix: this previously hard-coded itemgetter("date"), ignoring sort_key.
    for lst in dict_of_lists.values():
        lst.sort(key=key_getter, reverse=False)
    for _ in range(rounds):
        # One item from every non-empty list, ordered newest first.
        row = [lst.pop() for lst in dict_of_lists.values() if lst]
        for item in sorted(row, key=key_getter, reverse=True):
            yield item

View File

@ -4,6 +4,11 @@ from mhackspace.feeds.models import Article
register = template.Library()
@register.inclusion_tag('feeds/list.html')
@register.inclusion_tag("feeds/list.html")
def show_feeds():
return {'articles': Article.objects.select_related('feed').filter(displayed=True, feed__enabled=True)}
return {
"articles": Article.objects.select_related("feed").filter(
displayed=True, feed__enabled=True
)
}