More fixes to the feed parser, implemented ordering again
This commit is contained in:
parent b3cd4aab0b
commit 3b9b943db1

@@ -1,26 +1,27 @@
 # -*- coding: utf-8 -*-
 import os
-import tempfile
 import requests
 import logging
+from io import BytesIO
 from time import mktime
 from datetime import datetime
-from django.conf import settings
-from django.core.files import File
-from stdimage.utils import render_variations
-from mhackspace.feeds.reader import fetch_feeds
 
-from mhackspace.feeds.models import Feed, Article, image_variations
+from django.core.files import File
 
+from mhackspace.feeds.reader import fetch_feeds
+from mhackspace.feeds.models import Feed, Article
 
 logger = logging.getLogger(__name__)
 
 
 def import_feeds(feed=False):
     remove_old_articles()
-    articles = []
-    for article in fetch_feeds(get_active_feeds(feed)):
+    articles = fetch_feeds(get_active_feeds(feed))
+    article_objects = []
+    # for author in articles:
+    for article in articles:
         date = datetime.fromtimestamp(mktime(article["date"]))
-        articles.append(
+        article_objects.append(
             Article(
                 url=article["url"],
                 feed=Feed.objects.get(pk=article["feed"]),
@@ -30,7 +31,7 @@ def import_feeds(feed=False):
                 date=date,
             )
         )
-    articles = Article.objects.bulk_create(articles)
+    articles = Article.objects.bulk_create(article_objects)
     download_remote_images()
     return articles
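Note: the `article["date"]` that `import_feeds` converts is feedparser's `published_parsed`, a `time.struct_time` (see the reader changes below). A minimal standalone sketch of that conversion; the helper name and sample values are illustrative, not from the repo:

from datetime import datetime
from time import mktime, struct_time

def struct_time_to_datetime(parsed_date):
    # mktime() turns a struct_time (local time) into a Unix timestamp,
    # which datetime.fromtimestamp() then converts to a datetime object
    return datetime.fromtimestamp(mktime(parsed_date))

# 2018-09-01 12:00:00 expressed as the 9-tuple struct_time uses
example = struct_time((2018, 9, 1, 12, 0, 0, 5, 244, -1))
print(struct_time_to_datetime(example))  # 2018-09-01 12:00:00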
@@ -43,7 +44,6 @@ def remove_old_articles():
 
 def download_remote_images():
     for article in Article.objects.all():
-        print(article.original_image)
         if not article.original_image:
             continue
         try:
@@ -57,21 +57,13 @@ def download_remote_images():
             return
 
         try:
-            tmpfile = tempfile.TemporaryFile(mode='w+b')
-            tmpfile.write(result.content)
-
             article.image.save(
                 os.path.basename(article.original_image),
-                File(tmpfile),
+                File(BytesIO(result.content)),
             )
-
-            file_path = f'{settings.MEDIA_ROOT}/{article.image.file}'
-            render_variations(file_path, image_variations, replace=True)
             article.save()
         except Exception as e:
             logger.exception(result)
-        finally:
-            tmpfile.close()
 
 
 def get_active_feeds(feed=False):
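Note: the tempfile plumbing (and the stdimage `render_variations` post-processing) is dropped in favour of handing Django an in-memory buffer. A sketch of the new pattern under the same assumptions — a model instance with an `image` field; the helper name is hypothetical:

import os
from io import BytesIO

import requests
from django.core.files import File

def save_remote_image(article, url):
    # fetch the remote image; raise on HTTP errors instead of saving junk
    result = requests.get(url, timeout=10)
    result.raise_for_status()
    # BytesIO keeps the payload in memory; File() wraps it in the file-like
    # object Django's storage API expects, so no temporary file is needed
    article.image.save(os.path.basename(url), File(BytesIO(result.content)))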

@@ -1,20 +1,34 @@
 import lxml
 import feedparser
+import datetime
 from io import StringIO
+from operator import itemgetter
 from lxml.html.clean import Cleaner
 
 from django.utils.html import escape
 
+filter_by_date_expire = datetime.datetime.now() - datetime.timedelta(
+    days=int(1.5 * 365)
+)
 
-namespaces = {}
-urls = [
-    "https://feeds.feedburner.com/projects-jl",
-    "https://hackaday.com/tag/emf-camp-2018/feed/",
-    "https://maidstone-hackspace.org.uk/blog/rss/",
-    "http://webboggles.com/feed/",
-    "https://blog.digitaloctave.com/rss.xml",
-]
-html_parser = lxml.etree.HTMLParser()
+def filter_by_tags(self, node, tags=None):
+    """filter the feed out by category tag; if no tags, assume it's pre-filtered"""
+    if self.tags is None:
+        return True
+    for category in node.xpath("./category", namespaces=namespaces):
+        if category.text.lower() in self.tags:
+            return True
+    return False
 
 
+def filter_by_date(self, date):
+    """filter the feed out by date"""
+    if self.enable_date_filter is False:
+        return True
+    if date > self.filter_by_date_expire:
+        return True
+    return False
 
 
 def parse_content(content):
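Note: `filter_by_date_expire` is computed once at import time, roughly eighteen months before "now", and `filter_by_date` keeps anything newer. A self-contained sketch of that cutoff logic; `is_fresh` is an illustrative name, not from the repo:

import datetime

# cutoff fixed at module import, ~1.5 years back, mirroring the diff
filter_by_date_expire = datetime.datetime.now() - datetime.timedelta(
    days=int(1.5 * 365)
)

def is_fresh(published):
    # keep entries newer than the cutoff, drop older ones
    return published > filter_by_date_expire

print(is_fresh(datetime.datetime.now()))                                 # True
print(is_fresh(datetime.datetime.now() - datetime.timedelta(days=700)))  # False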
@@ -29,25 +43,13 @@ def parse_content(content):
 
 
 def fetch_image_from_node_text(text):
+    html_parser = lxml.etree.HTMLParser()
     description = lxml.etree.parse(StringIO(text), html_parser)
     for image in description.xpath(".//img"):
         return image.get("src")
     return None
 
 
-def fetch_node_text(node, name, default=u""):
-    """fetch the text from the node we are given, we are working in unicode
-    so decode byte strings to unicode"""
-    result = node.xpath("./%s" % name)
-    if result is None or len(result) is 0:
-        return default
-
-    if type(result[-1].text) is str:
-        return result[-1].text.encode("utf-8")
-    else:
-        return result[-1].text
-
-
 def fetch_image(post, node, namespaces):
     """Try and get an image from an item in the feed, use various fall back methods"""
     if hasattr(post, "media_thumbnail"):
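Note: `fetch_image_from_node_text` now builds its own `HTMLParser` rather than relying on the module-level one deleted in the first hunk. Its behaviour is easy to check in isolation; a minimal sketch with an inlined copy of the function:

from io import StringIO
from lxml import etree

def fetch_image_from_node_text(text):
    # parse the HTML fragment and return the src of the first <img>, if any
    html_parser = etree.HTMLParser()
    description = etree.parse(StringIO(text), html_parser)
    for image in description.xpath(".//img"):
        return image.get("src")
    return None

print(fetch_image_from_node_text('<p><img src="https://example.com/a.png"></p>'))
# https://example.com/a.png
print(fetch_image_from_node_text("<p>no images here</p>"))  # None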
@@ -62,9 +64,7 @@ def fetch_image(post, node, namespaces):
         return image
 
     # final attempt at getting an image from the item using description
-    result = fetch_node_text(node, "description")
-    if result:
-        image = fetch_image_from_node_text(result)
+    image = fetch_image_from_node_text(post.description)
     if image:
         return image
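Note: with `fetch_node_text` gone, `fetch_image` falls back straight to `post.description`. The `media_thumbnail` branch just above this hunk relies on feedparser's normalisation; a hedged sketch of that shape, with an illustrative helper name:

# feedparser normalises <media:thumbnail> to a list of dicts carrying a
# "url" key on the entry; this mirrors the hasattr() check in fetch_image
def thumbnail_url(post):
    thumbs = getattr(post, "media_thumbnail", None)
    if thumbs:
        return thumbs[0].get("url")
    return None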
@@ -73,10 +73,12 @@ def fetch_image(post, node, namespaces):
 
 
 def fetch_feeds(feeds):
-    articles = []
+    articles = {}
 
+    print(feeds)
     for feed in feeds:
         url = feed.get("url")
+        author = feed.get("author")
         parsed = feedparser.parse(url)
         namespaces = {}
         if hasattr(parsed, "namespaces"):
@@ -84,10 +86,13 @@ def fetch_feeds(feeds):
         feed_image = ""
         if hasattr(parsed.feed, "image"):
             feed_image = parsed.feed.image.get("href")
+        print(author)
         for post in parsed.entries:
             root_node = parse_content(post.description)
             image = fetch_image(post, root_node, namespaces) or feed_image
-            yield {
+            articles.setdefault(author, []).append(
+                {
                 "url": post.link,
                 "feed": feed.get("id"),
                 "title": post.title,
@@ -96,4 +101,37 @@ def fetch_feeds(feeds):
                 "date": post.published_parsed,
                 "image": image,
             }
-    return articles
+            )
+
+    # order each author's articles by date
+    for author in articles.keys():
+        articles[author] = sorted(
+            articles[author], key=itemgetter("date"), reverse=True
+        )
+    return [f for f in alternate_dict_and_sort_by_list_item_key(articles)]
+
+    # return articles
+
+
+def alternate_dict_and_sort_by_list_item_key(dict_of_lists, sort_key="date"):
+    """take a dictionary of ordered lists, step through each row, sort the
+    current item position in each list, and yield the result.
+
+    basically keeps date ordering while stepping through the blog entries,
+    to make it fair for people who do not blog often."""
+    longest_list_length = max(
+        [len(dict_of_lists[d]) for d in dict_of_lists.keys()] + [0]
+    )
+
+    # order each feed by date, newest date at the end of the list so it can be popped
+    for author in dict_of_lists:
+        dict_of_lists[author].sort(key=itemgetter("date"), reverse=False)
+
+    # now iterate through the author lists, popping the last element of each
+    # and ordering the current row by date
+    for i in range(0, longest_list_length):
+        # take one value per author, then order the row by sort_key (date by default)
+        feed_row = [d.pop() for d in dict_of_lists.values() if d]
+        results = sorted(feed_row, key=itemgetter(sort_key), reverse=True)
+        for item in results:
+            yield item
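Note: this interleaving is the ordering the commit message refers to. Each round takes at most one post per author (the newest remaining, since each list is sorted ascending and popped from the end) and orders that round by date, so infrequent bloggers are not buried. A toy run with invented data:

from operator import itemgetter

# condensed copy of alternate_dict_and_sort_by_list_item_key from the diff
def alternate(dict_of_lists, sort_key="date"):
    longest = max([len(v) for v in dict_of_lists.values()] + [0])
    for author in dict_of_lists:
        dict_of_lists[author].sort(key=itemgetter(sort_key))  # oldest first
    for _ in range(longest):
        # pop the newest remaining post from every author that still has posts
        row = [posts.pop() for posts in dict_of_lists.values() if posts]
        for item in sorted(row, key=itemgetter(sort_key), reverse=True):
            yield item

feeds = {
    "alice": [{"title": "a1", "date": 3}, {"title": "a2", "date": 1}],
    "bob": [{"title": "b1", "date": 2}],
}
print([p["title"] for p in alternate(feeds)])
# ['a1', 'b1', 'a2'] -- bob's single post is not buried below all of alice's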

@@ -4,6 +4,11 @@ from mhackspace.feeds.models import Article
 
 register = template.Library()
 
-@register.inclusion_tag('feeds/list.html')
+
+@register.inclusion_tag("feeds/list.html")
 def show_feeds():
-    return {'articles': Article.objects.select_related('feed').filter(displayed=True, feed__enabled=True)}
+    return {
+        "articles": Article.objects.select_related("feed").filter(
+            displayed=True, feed__enabled=True
+        )
+    }