From d4357e3a7546e2f87ec9661bfdf584772edc7e87 Mon Sep 17 00:00:00 2001 From: Oliver Marks Date: Tue, 18 Sep 2018 20:11:45 +0100 Subject: [PATCH] Update feed parser to use a different library --- mhackspace/contact/templatetags/recapture.py | 7 +-- mhackspace/feeds/helper.py | 25 +------- mhackspace/feeds/reader.py | 65 +++++++------------- requirements/base.txt | 14 ++--- 4 files changed, 33 insertions(+), 78 deletions(-) diff --git a/mhackspace/contact/templatetags/recapture.py b/mhackspace/contact/templatetags/recapture.py index d097d22..c3ce887 100644 --- a/mhackspace/contact/templatetags/recapture.py +++ b/mhackspace/contact/templatetags/recapture.py @@ -1,11 +1,10 @@ # -*- coding: utf-8 -*- from django import template -from mhackspace.feeds.models import Feed -from scaffold.readers.rss_reader import feed_reader from django.conf import settings register = template.Library() -@register.inclusion_tag('partials/recapture.html') + +@register.inclusion_tag("partials/recapture.html") def google_capture(): - return settings.CAPTCHA + return settings.CAPTCHA diff --git a/mhackspace/feeds/helper.py b/mhackspace/feeds/helper.py index baf12b4..b226931 100644 --- a/mhackspace/feeds/helper.py +++ b/mhackspace/feeds/helper.py @@ -1,53 +1,34 @@ # -*- coding: utf-8 -*- import os import logging -import feedparser from time import mktime from datetime import datetime from urllib.request import urlretrieve from django.core.files import File -from django.utils.timezone import make_aware -from django.utils import timezone from stdimage.utils import render_variations from mhackspace.feeds.reader import fetch_feeds -# from scaffold.readers.rss_reader import feed_reader - from mhackspace.feeds.models import Feed, Article, image_variations logger = logging.getLogger(__name__) -def feed_reader(feeds): - for feed in feeds: - print(feed) - yield feedparser.parse(feed["url"]) - - def import_feeds(feed=False): remove_old_articles() - - print([f.get("url") for f in get_active_feeds(feed)]) - 
rss_articles = fetch_feeds(get_active_feeds(feed)) - articles = [] - for article in rss_articles: + for article in fetch_feeds(get_active_feeds(feed)): date = datetime.fromtimestamp(mktime(article["date"])) - print(article["title"]) - print(article["image"]) - print('#############') articles.append( Article( url=article["url"], feed=Feed.objects.get(pk=article["feed"]), - title=article["title"][0:100], - original_image=article["image"][0:100], + title=article["title"], + original_image=article["image"], description=article["description"], date=date, ) ) - articles = Article.objects.bulk_create(articles) download_remote_images() return articles diff --git a/mhackspace/feeds/reader.py b/mhackspace/feeds/reader.py index c0df1b4..3802f8c 100644 --- a/mhackspace/feeds/reader.py +++ b/mhackspace/feeds/reader.py @@ -5,6 +5,9 @@ from lxml import etree from lxml.html.clean import Cleaner from io import StringIO, BytesIO +from django.utils.html import escape + + namespaces = {} urls = [ "https://feeds.feedburner.com/projects-jl", @@ -20,18 +23,6 @@ def parse_content(content): headers = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0" } - html_cleaner = Cleaner() - html_cleaner.javascript = True - html_cleaner.style = True - html_cleaner.remove_tags = [ - "script", - "iframe", - "link", - "style", - "img", - "div", - ] - # ~ html_cleaner.allow_tags = ['a', 'p', 'strong'] html_img_cleaner = Cleaner(allow_tags=["img"], remove_unknown_tags=False) html_img_cleaner.allow_tags = ["img"] @@ -40,17 +31,13 @@ def parse_content(content): remove_blank_text=True, ns_clean=True, encoding="utf-8" ) - print("------------------") - print(content) - - dom = lxml.etree.XML("
" + content + "
", xml_parser) - return dom + return lxml.etree.XML("
" + escape(content) + "
", xml_parser) def fetch_image_from_node_text(text): description = lxml.etree.parse(text, html_parser) for image in description.xpath(".//img"): - print('fetch image from node text') + print("fetch image from node text") return image.get("src") return None @@ -71,16 +58,14 @@ def fetch_node_text(node, name, default=u""): def fetch_image(post, node, namespaces): """Try and get an image from an item in the feed, use various fall back methods""" if hasattr(post, "media_thumbnail"): - print('media') - + print("media") image = post.media_thumbnail print(image) - if image: return image[0].get("url") if hasattr(post, "content"): - print('content') + print("content") content = " ".join(c.value for c in post.content) image = fetch_image_from_node_text(content) if image: @@ -89,7 +74,7 @@ def fetch_image(post, node, namespaces): # final attempt at getting an image from the item using description result = fetch_node_text(node, "description") if result: - print('description') + print("description") image = fetch_image_from_node_text(result) if image: return image @@ -99,33 +84,27 @@ def fetch_image(post, node, namespaces): def fetch_feeds(feeds): + articles = [] + for feed in feeds: - url = feed.get('url') - print(url) + url = feed.get("url") parsed = feedparser.parse(url) namespaces = {} if hasattr(parsed, "namespaces"): namespaces = parsed.namespaces feed_image = "" if hasattr(parsed.feed, "image"): - feed_image = parsed.feed.image.get('href') - articles = [] + feed_image = parsed.feed.image.get("href") for post in parsed.entries: - print(post.published) - print(feed_image) root_node = parse_content(post.description) - image = fetch_image(post, root_node, namespaces) #or feed_image - - articles.append( - { - "url": post.link, - "feed": feed.get('id'), - "title": post.title, - "original_image": image, - "description": post.description, - "date": post.published_parsed, - "image": feed_image, - } - ) - print(articles[-1]) + image = fetch_image(post, root_node, namespaces) or 
feed_image + yield { + "url": post.link, + "feed": feed.get("id"), + "title": post.title, + "original_image": image, + "description": post.description, + "date": post.published_parsed, + "image": image, + } return articles diff --git a/requirements/base.txt b/requirements/base.txt index b2bbb7d..c90294d 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -8,10 +8,10 @@ django==2.1.1 django-dynamic-filenames==1.1.3 # Configuration django-environ==0.4.5 -whitenoise==4.0 +whitenoise==4.1 # Static and Media Storage # ------------------------------------------------ -boto3==1.9.1 +boto3==1.9.5 django-storages==1.7.1 # django-storages-redux==1.3.2 @@ -55,7 +55,7 @@ django-compressor==2.2 #fix for use with s3 buckets merged in master, so next release we can remove this #django-sass-processor==0.5.7 git+https://github.com/jrief/django-sass-processor.git -libsass==0.14.5 +libsass==0.15.0 lxml==4.2.5 # WSGI Handler @@ -72,9 +72,6 @@ gocardless_pro==1.8.0 braintree==3.48.0 django-autofixture==0.12.1 - -git+https://github.com/olymk2/scaffold.git -#git+git://github.com/olymk2/django-wiki.git git+git://github.com/django-wiki/django-wiki.git djangorestframework==3.8.2 @@ -83,8 +80,7 @@ django-filter==2.0.0 coreapi==2.3.3 # api libraries end -#martor==1.3.2 -git+git://github.com/olymk2/django-markdown-editor.git +martor==1.3.3 django-spirit==0.6.1 django-djconfig==0.8.0 @@ -103,4 +99,4 @@ python-magic==0.4.15 ldap3==2.5.1 bcrypt==3.1.4 python-twitter==3.4.2 -feedparser +feedparser==5.2.1