hackspace/mhackspace/feeds/reader.py

import lxml
import feedparser
from operator import itemgetter
from lxml import etree
from lxml.html.clean import Cleaner
from io import StringIO, BytesIO

from django.utils.html import escape


namespaces = {}
urls = [
    "https://feeds.feedburner.com/projects-jl",
    "https://hackaday.com/tag/emf-camp-2018/feed/",
    "https://maidstone-hackspace.org.uk/blog/rss/",
    "http://webboggles.com/feed/",
    "https://blog.digitaloctave.com/rss.xml",
]
html_parser = lxml.etree.HTMLParser()


def parse_content(content):
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0"
    }

    html_img_cleaner = Cleaner(allow_tags=["img"], remove_unknown_tags=False)
    html_img_cleaner.allow_tags = ["img"]

    xml_parser = lxml.etree.XMLParser(
        remove_blank_text=True, ns_clean=True, encoding="utf-8"
    )

    return lxml.etree.XML("<div>" + escape(content) + "</div>", xml_parser)


def fetch_image_from_node_text(text):
    description = lxml.etree.parse(text, html_parser)
    for image in description.xpath(".//img"):
        print("fetch image from node text")
        return image.get("src")
    return None


def fetch_node_text(node, name, default=u""):
    """fetch the text from the node we are given, we are working in unicode
        so decode byte strings to unicode"""
    result = node.xpath("./%s" % name)
    if result is None or len(result) is 0:
        return default

    if type(result[-1].text) is str:
        return result[-1].text.encode("utf-8")
    else:
        return result[-1].text


def fetch_image(post, node, namespaces):
    """Try and get an image from an item in the feed, use various fall back methods"""
    if hasattr(post, "media_thumbnail"):
        print("media")
        image = post.media_thumbnail
        print(image)
        if image:
            return image[0].get("url")

    if hasattr(post, "content"):
        print("content")
        content = " ".join(c.value for c in post.content)
        image = fetch_image_from_node_text(content)
        if image:
            return image

    # final attempt at getting an image from the item using description
    result = fetch_node_text(node, "description")
    if result:
        print("description")
        image = fetch_image_from_node_text(result)
        if image:
            return image

    # no image so lets fall back to the channel image if it exists
    return None


def fetch_feeds(feeds):
    articles = []

    for feed in feeds:
        url = feed.get("url")
        parsed = feedparser.parse(url)
        namespaces = {}
        if hasattr(parsed, "namespaces"):
            namespaces = parsed.namespaces
        feed_image = ""
        if hasattr(parsed.feed, "image"):
            feed_image = parsed.feed.image.get("href")
        for post in parsed.entries:
            root_node = parse_content(post.description)
            image = fetch_image(post, root_node, namespaces) or feed_image
            yield {
                "url": post.link,
                "feed": feed.get("id"),
                "title": post.title,
                "original_image": image,
                "description": post.description,
                "date": post.published_parsed,
                "image": image,
            }
    return articles