111 lines
3.2 KiB
Python
111 lines
3.2 KiB
Python
import lxml
|
|
import feedparser
|
|
from operator import itemgetter
|
|
from lxml import etree
|
|
from lxml.html.clean import Cleaner
|
|
from io import StringIO, BytesIO
|
|
|
|
from django.utils.html import escape
|
|
|
|
|
|
# Module-level namespace mapping; it starts out empty.
namespaces = dict()

# Feed URLs known to this module, one per line.
urls = [
    "https://feeds.feedburner.com/projects-jl",
    "https://hackaday.com/tag/emf-camp-2018/feed/",
    "https://maidstone-hackspace.org.uk/blog/rss/",
    "http://webboggles.com/feed/",
    "https://blog.digitaloctave.com/rss.xml",
]
|
|
# Shared lxml HTML parser, built once at import time and reused by
# fetch_image_from_node_text for every fragment it parses.
html_parser = etree.HTMLParser()
|
|
|
|
|
|
def parse_content(content):
    """Wrap *content* in a ``<div>`` and parse it into an lxml element.

    The content is passed through Django's ``escape`` before parsing, so any
    markup it contains ends up as plain text inside the returned ``<div>``
    rather than as child elements.

    Args:
        content: Text of a feed entry (callers pass the entry description).

    Returns:
        The root ``<div>`` element produced by ``lxml.etree.XML``.
    """
    # A fresh parser per call, configured exactly as before; the request
    # headers and the HTML cleaner that used to be built here were never used.
    xml_parser = lxml.etree.XMLParser(
        remove_blank_text=True, ns_clean=True, encoding="utf-8"
    )
    return lxml.etree.XML("<div>" + escape(content) + "</div>", xml_parser)
|
|
|
|
|
|
def fetch_image_from_node_text(text):
    """Return the ``src`` of the first ``<img>`` found in an HTML fragment.

    Args:
        text: HTML markup as ``str`` or ``bytes``.

    Returns:
        The ``src`` attribute of the first image element, or ``None`` when
        the text is empty, cannot be parsed, or contains no image.
    """
    if not text:
        return None

    try:
        # fromstring() parses markup held in memory. etree.parse() expects a
        # filename, URL or file object, so handing it the markup itself fails.
        root = lxml.etree.fromstring(text, html_parser)
    except (ValueError, lxml.etree.LxmlError):
        # Image extraction is best effort: unparseable markup means no image.
        return None

    if root is None:
        return None

    for image in root.xpath(".//img"):
        print("fetch image from node text")
        return image.get("src")
    return None
|
|
|
|
|
|
def fetch_node_text(node, name, default=u""):
    """Return the text of the last child of *node* matching *name*.

    The result is always unicode text: byte strings are decoded as UTF-8 so
    the return type matches the unicode *default*.

    Args:
        node: Object exposing an lxml-style ``xpath`` method.
        name: Child path to look up relative to *node* (used as ``./name``).
        default: Value returned when nothing matches or the match has no text.

    Returns:
        The matched element's text as ``str``, or *default*.
    """
    result = node.xpath("./%s" % name)
    if not result:
        return default

    # When several children match, the last one wins (as before).
    text = result[-1].text
    if text is None:
        # Element exists but is empty; treat it like a missing element.
        return default
    if isinstance(text, bytes):
        return text.decode("utf-8")
    return text
|
|
|
|
|
|
def fetch_image(post, node, namespaces):
    """Find an image URL for a feed entry, trying several sources in order.

    Sources, first hit wins:
      1. the entry's ``media_thumbnail`` list (``url`` of its first item),
      2. the first ``<img>`` inside the entry's joined ``content`` values,
      3. the first ``<img>`` inside the ``description`` child of *node*.

    Args:
        post: Feed entry object; ``media_thumbnail`` and ``content`` are
            optional attributes.
        node: Parsed element searched for a ``description`` child.
        namespaces: Accepted for interface compatibility; not used here.

    Returns:
        The image URL, or ``None`` when no source yields one (the caller may
        then fall back to the channel image).
    """
    # 1) explicit thumbnail supplied by the feed
    if hasattr(post, "media_thumbnail"):
        print("media")
        thumbnails = post.media_thumbnail
        print(thumbnails)
        if thumbnails:
            return thumbnails[0].get("url")

    # 2) first image embedded in the entry content
    if hasattr(post, "content"):
        print("content")
        joined = " ".join([part.value for part in post.content])
        found = fetch_image_from_node_text(joined)
        if found:
            return found

    # 3) last resort: image embedded in the item description
    description_text = fetch_node_text(node, "description")
    if not description_text:
        return None
    print("description")
    return fetch_image_from_node_text(description_text) or None
|
|
|
|
|
|
def fetch_feeds(feeds):
    """Yield one article dict for every entry of every feed in *feeds*.

    This is a generator: articles are produced lazily, feed by feed.

    Args:
        feeds: Iterable of mappings read with ``.get("url")`` (the feed
            address) and ``.get("id")`` (copied into each article).

    Yields:
        dict with the keys ``url``, ``feed``, ``title``, ``original_image``,
        ``description``, ``date`` and ``image``. ``image`` and
        ``original_image`` hold the entry's own image or, failing that, the
        feed-level image.
    """
    for feed in feeds:
        parsed = feedparser.parse(feed.get("url"))
        namespaces = getattr(parsed, "namespaces", {})

        # Channel-level image, used when an entry has no image of its own.
        feed_image = ""
        if hasattr(parsed.feed, "image"):
            feed_image = parsed.feed.image.get("href")

        for post in parsed.entries:
            # Not every entry carries a description or a publication date;
            # tolerate their absence instead of aborting the remaining feeds.
            description = getattr(post, "description", "")
            published = getattr(post, "published_parsed", None) or getattr(
                post, "updated_parsed", None
            )

            root_node = parse_content(description)
            image = fetch_image(post, root_node, namespaces) or feed_image
            yield {
                "url": post.link,
                "feed": feed.get("id"),
                "title": post.title,
                "original_image": image,
                "description": description,
                "date": published,
                "image": image,
            }
|