More fixes to the feed parser, implemented ordering again
parent b3cd4aab0b
commit 3b9b943db1

@@ -1,26 +1,27 @@
 # -*- coding: utf-8 -*-
 import os
 import tempfile
 import requests
 import logging
+from io import BytesIO
 from time import mktime
 from datetime import datetime
 from django.conf import settings
-from django.core.files import File
-
-from mhackspace.feeds.reader import fetch_feeds
-from mhackspace.feeds.models import Feed, Article
+from django.core.files import File
+from stdimage.utils import render_variations
+from mhackspace.feeds.reader import fetch_feeds
+
+from mhackspace.feeds.models import Feed, Article, image_variations
 
 logger = logging.getLogger(__name__)
 
 
 def import_feeds(feed=False):
     remove_old_articles()
-    articles = []
-    for article in fetch_feeds(get_active_feeds(feed)):
+    articles = fetch_feeds(get_active_feeds(feed))
+    article_objects = []
+    # for author in articles:
+    for article in articles:
         date = datetime.fromtimestamp(mktime(article["date"]))
-        articles.append(
+        article_objects.append(
             Article(
                 url=article["url"],
                 feed=Feed.objects.get(pk=article["feed"]),
@@ -30,7 +31,7 @@ def import_feeds(feed=False):
                 date=date,
             )
         )
-    articles = Article.objects.bulk_create(articles)
+    articles = Article.objects.bulk_create(article_objects)
     download_remote_images()
     return articles
 
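Editor's note: the import_feeds rewrite above keeps the rows returned by fetch_feeds separate from the list of unsaved Article instances, so the loop never appends to the same name it is iterating over. A minimal, framework-free sketch of that accumulate-then-bulk-create pattern (names here are illustrative, not from the repository):

from datetime import datetime
from time import mktime, struct_time

def build_rows(rows):
    # accumulate converted rows in a second list instead of mutating `rows`
    objects = []
    for row in rows:
        # feedparser hands dates over as time.struct_time; convert to datetime
        date = datetime.fromtimestamp(mktime(row["date"]))
        objects.append({"url": row["url"], "date": date})
    return objects

sample = struct_time((2018, 5, 1, 12, 0, 0, 1, 121, -1))
print(build_rows([{"url": "https://example.org/post", "date": sample}]))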
@@ -43,7 +44,6 @@ def remove_old_articles():
 
 def download_remote_images():
     for article in Article.objects.all():
-        print(article.original_image)
         if not article.original_image:
             continue
         try:
@@ -57,21 +57,13 @@ def download_remote_images():
             return
 
-        try:
-            tmpfile = tempfile.TemporaryFile(mode='w+b')
-            tmpfile.write(result.content)
-
         article.image.save(
             os.path.basename(article.original_image),
-            File(tmpfile),
+            File(BytesIO(result.content)),
         )
 
         file_path = f'{settings.MEDIA_ROOT}/{article.image.file}'
         render_variations(file_path, image_variations, replace=True)
         article.save()
-        except Exception as e:
-            logger.exception(result)
-        finally:
-            tmpfile.close()
 
 
 def get_active_feeds(feed=False):
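Editor's note: the download_remote_images change drops the temporary file in favour of an in-memory buffer; Django's File() accepts any file-like object, so there is nothing left to close. A sketch of the same pattern, assuming a Django model instance with an ImageField named image and requests available:

from io import BytesIO

import requests
from django.core.files import File

def save_remote_image(article, url):
    result = requests.get(url, timeout=10)
    result.raise_for_status()  # surface HTTP errors instead of saving junk
    name = url.rsplit("/", 1)[-1] or "image"
    # BytesIO wraps the downloaded bytes; no tempfile, no finally/close
    article.image.save(name, File(BytesIO(result.content)))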
@@ -1,20 +1,34 @@
 import lxml
 import feedparser
 import datetime
 from io import StringIO
+from operator import itemgetter
 from lxml.html.clean import Cleaner
 
 from django.utils.html import escape
 
+filter_by_date_expire = datetime.datetime.now() - datetime.timedelta(
+    days=int(1.5 * 365)
+)
 
-namespaces = {}
-urls = [
-    "https://feeds.feedburner.com/projects-jl",
-    "https://hackaday.com/tag/emf-camp-2018/feed/",
-    "https://maidstone-hackspace.org.uk/blog/rss/",
-    "http://webboggles.com/feed/",
-    "https://blog.digitaloctave.com/rss.xml",
-]
+html_parser = lxml.etree.HTMLParser()
 
+
+def filter_by_tags(self, node, tags=None):
+    """filter the feed out by category tag, if no tags assume its pre filtered"""
+    if self.tags is None:
+        return True
+    for category in node.xpath("./category", namespaces=namespaces):
+        if category.text.lower() in self.tags:
+            return True
+    return False
+
+
+def filter_by_date(self, date):
+    """filter the feed out by date"""
+    if self.enable_date_filter is False:
+        return True
+    if date > self.filter_by_date_expire:
+        return True
+    return False
 
 
 def parse_content(content):
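Editor's note: filter_by_tags and filter_by_date are added at module level but still take self and read self.tags / self.enable_date_filter, so they read as lifted from a class and will not run standalone as committed. A self-contained sketch of the date filter, with the cut-off mirroring filter_by_date_expire above (names are illustrative):

import datetime

CUTOFF = datetime.datetime.now() - datetime.timedelta(days=int(1.5 * 365))

def is_recent(date, cutoff=CUTOFF, enabled=True):
    # keep everything when filtering is disabled, otherwise require a newer date
    return not enabled or date > cutoff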
@@ -29,25 +43,13 @@ def parse_content(content):
 
 
 def fetch_image_from_node_text(text):
-    html_parser = lxml.etree.HTMLParser()
     description = lxml.etree.parse(StringIO(text), html_parser)
     for image in description.xpath(".//img"):
         return image.get("src")
     return None
 
 
 def fetch_node_text(node, name, default=u""):
     """fetch the text from the node we are given, we are working in unicode
     so decode byte strings to unicode"""
     result = node.xpath("./%s" % name)
     if result is None or len(result) is 0:
         return default
 
     if type(result[-1].text) is str:
         return result[-1].text.encode("utf-8")
     else:
         return result[-1].text
 
 
 def fetch_image(post, node, namespaces):
     """Try and get an image from an item in the feed, use various fall back methods"""
     if hasattr(post, "media_thumbnail"):
@@ -62,9 +64,7 @@ def fetch_image(post, node, namespaces):
         return image
 
     # final attempt at getting an image from the item using description
-    result = fetch_node_text(node, "description")
-    if result:
-        image = fetch_image_from_node_text(result)
+    image = fetch_image_from_node_text(post.description)
     if image:
         return image
 
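Editor's note: fetch_node_text above tests len(result) is 0, which only works because CPython caches small integers, and it returns bytes for str input despite the docstring's unicode intent. A safer, equality-based sketch with the same call shape (illustrative only, node being an lxml element as above):

def fetch_node_text(node, name, default=""):
    """Return the text of the last matching child element, or `default`."""
    result = node.xpath("./%s" % name)
    if not result:  # covers None and the empty list, no identity check on 0
        return default
    return result[-1].text or default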
@@ -73,10 +73,12 @@ def fetch_image(post, node, namespaces):
 
 
 def fetch_feeds(feeds):
-    articles = []
+    articles = {}
 
+    print(feeds)
     for feed in feeds:
         url = feed.get("url")
+        author = feed.get("author")
         parsed = feedparser.parse(url)
         namespaces = {}
         if hasattr(parsed, "namespaces"):
@@ -84,10 +86,13 @@ def fetch_feeds(feeds):
feed_image = ""
|
||||
if hasattr(parsed.feed, "image"):
|
||||
feed_image = parsed.feed.image.get("href")
|
||||
print(author)
|
||||
for post in parsed.entries:
|
||||
root_node = parse_content(post.description)
|
||||
image = fetch_image(post, root_node, namespaces) or feed_image
|
||||
yield {
|
||||
|
||||
articles.setdefault(author, []).append(
|
||||
{
|
||||
"url": post.link,
|
||||
"feed": feed.get("id"),
|
||||
"title": post.title,
|
||||
|
@@ -96,4 +101,37 @@ def fetch_feeds(feeds):
"date": post.published_parsed,
|
||||
"image": image,
|
||||
}
|
||||
return articles
|
||||
)
|
||||
|
||||
# order authors articles by date
|
||||
for author in articles.keys():
|
||||
articles[author] = sorted(
|
||||
articles[author], key=itemgetter("date"), reverse=True
|
||||
)
|
||||
return [f for f in alternate_dict_and_sort_by_list_item_key(articles)]
|
||||
|
||||
# return articles
|
||||
|
||||
|
||||
def alternate_dict_and_sort_by_list_item_key(dict_of_lists, sort_key="date"):
|
||||
""" take a dictonary of ordered lists, step through each row and sort the current
|
||||
item position in each list and yield the result.
|
||||
|
||||
basically gives the ordering of date while stepping through the blog entries to make it fair
|
||||
for people who do not blog often. """
|
||||
longest_list_length = max(
|
||||
[len(dict_of_lists[d]) for d in dict_of_lists.keys()] + [0]
|
||||
)
|
||||
|
||||
# order each feed by date, newest date at the end of the list so it can be poped
|
||||
for author in dict_of_lists:
|
||||
dict_of_lists[author].sort(key=itemgetter("date"), reverse=False)
|
||||
|
||||
# now iterate through author lists, popping the first elements and order the current item
|
||||
# from each list by date
|
||||
for i in range(0, longest_list_length):
|
||||
# get first value from each key, and order the list by sort key which is date by default
|
||||
feed_row = [d.pop() for d in dict_of_lists.values() if d]
|
||||
results = sorted(feed_row, key=itemgetter(sort_key), reverse=True)
|
||||
for item in results:
|
||||
yield item
|
||||
|
|
|
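Editor's note: a quick, framework-free demonstration of the round-robin ordering implemented above — each pass yields at most one entry per author, newest first within the pass, so infrequent bloggers still surface early. With the function above in scope, and integer dates standing in for the real struct_time values:

feeds = {
    "alice": [{"author": "alice", "date": 1}, {"author": "alice", "date": 3}],
    "bob": [{"author": "bob", "date": 2}],
}
for entry in alternate_dict_and_sort_by_list_item_key(feeds):
    print(entry["author"], entry["date"])
# alice 3, bob 2 (first pass, newest first), then alice 1 (second pass)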
@@ -4,6 +4,11 @@ from mhackspace.feeds.models import Article
 
 register = template.Library()
 
-@register.inclusion_tag('feeds/list.html')
+@register.inclusion_tag("feeds/list.html")
 def show_feeds():
-    return {'articles': Article.objects.select_related('feed').filter(displayed=True, feed__enabled=True)}
+    return {
+        "articles": Article.objects.select_related("feed").filter(
+            displayed=True, feed__enabled=True
+        )
+    }
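Editor's note: the reformatted show_feeds tag returns the same context as before, just split across lines. The queryset it builds can be exercised directly, for example from a Django shell (model names as in the diff above); in a template the tag would be pulled in with {% load %} on this tag module's file name, then called as {% show_feeds %}:

from mhackspace.feeds.models import Article

articles = Article.objects.select_related("feed").filter(
    displayed=True, feed__enabled=True
)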