Update feed parser to use a different library

This commit is contained in:
Oliver Marks 2018-09-18 20:11:45 +01:00
parent aecd76b567
commit d4357e3a75
4 changed files with 33 additions and 78 deletions

View File

@@ -1,11 +1,10 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from django import template from django import template
from mhackspace.feeds.models import Feed
from scaffold.readers.rss_reader import feed_reader
from django.conf import settings from django.conf import settings
register = template.Library() register = template.Library()
@register.inclusion_tag('partials/recapture.html')
@register.inclusion_tag("partials/recapture.html")
def google_capture(): def google_capture():
return settings.CAPTCHA return settings.CAPTCHA

View File

@@ -1,53 +1,34 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import os import os
import logging import logging
import feedparser
from time import mktime from time import mktime
from datetime import datetime from datetime import datetime
from urllib.request import urlretrieve from urllib.request import urlretrieve
from django.core.files import File from django.core.files import File
from django.utils.timezone import make_aware
from django.utils import timezone
from stdimage.utils import render_variations from stdimage.utils import render_variations
from mhackspace.feeds.reader import fetch_feeds from mhackspace.feeds.reader import fetch_feeds
# from scaffold.readers.rss_reader import feed_reader
from mhackspace.feeds.models import Feed, Article, image_variations from mhackspace.feeds.models import Feed, Article, image_variations
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def feed_reader(feeds):
for feed in feeds:
print(feed)
yield feedparser.parse(feed["url"])
def import_feeds(feed=False): def import_feeds(feed=False):
remove_old_articles() remove_old_articles()
print([f.get("url") for f in get_active_feeds(feed)])
rss_articles = fetch_feeds(get_active_feeds(feed))
articles = [] articles = []
for article in rss_articles: for article in fetch_feeds(get_active_feeds(feed)):
date = datetime.fromtimestamp(mktime(article["date"])) date = datetime.fromtimestamp(mktime(article["date"]))
print(article["title"])
print(article["image"])
print('#############')
articles.append( articles.append(
Article( Article(
url=article["url"], url=article["url"],
feed=Feed.objects.get(pk=article["feed"]), feed=Feed.objects.get(pk=article["feed"]),
title=article["title"][0:100], title=article["title"],
original_image=article["image"][0:100], original_image=article["image"],
description=article["description"], description=article["description"],
date=date, date=date,
) )
) )
articles = Article.objects.bulk_create(articles) articles = Article.objects.bulk_create(articles)
download_remote_images() download_remote_images()
return articles return articles

View File

@@ -5,6 +5,9 @@ from lxml import etree
from lxml.html.clean import Cleaner from lxml.html.clean import Cleaner
from io import StringIO, BytesIO from io import StringIO, BytesIO
from django.utils.html import escape
namespaces = {} namespaces = {}
urls = [ urls = [
"https://feeds.feedburner.com/projects-jl", "https://feeds.feedburner.com/projects-jl",
@@ -20,18 +23,6 @@ def parse_content(content):
headers = { headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0" "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0"
} }
html_cleaner = Cleaner()
html_cleaner.javascript = True
html_cleaner.style = True
html_cleaner.remove_tags = [
"script",
"iframe",
"link",
"style",
"img",
"div",
]
# ~ html_cleaner.allow_tags = ['a', 'p', 'strong']
html_img_cleaner = Cleaner(allow_tags=["img"], remove_unknown_tags=False) html_img_cleaner = Cleaner(allow_tags=["img"], remove_unknown_tags=False)
html_img_cleaner.allow_tags = ["img"] html_img_cleaner.allow_tags = ["img"]
@@ -40,17 +31,13 @@ def parse_content(content):
remove_blank_text=True, ns_clean=True, encoding="utf-8" remove_blank_text=True, ns_clean=True, encoding="utf-8"
) )
print("------------------") return lxml.etree.XML("<div>" + escape(content) + "</div>", xml_parser)
print(content)
dom = lxml.etree.XML("<div>" + content + "</div>", xml_parser)
return dom
def fetch_image_from_node_text(text): def fetch_image_from_node_text(text):
description = lxml.etree.parse(text, html_parser) description = lxml.etree.parse(text, html_parser)
for image in description.xpath(".//img"): for image in description.xpath(".//img"):
print('fetch image from node text') print("fetch image from node text")
return image.get("src") return image.get("src")
return None return None
@@ -71,16 +58,14 @@ def fetch_node_text(node, name, default=u""):
def fetch_image(post, node, namespaces): def fetch_image(post, node, namespaces):
"""Try and get an image from an item in the feed, use various fall back methods""" """Try and get an image from an item in the feed, use various fall back methods"""
if hasattr(post, "media_thumbnail"): if hasattr(post, "media_thumbnail"):
print('media') print("media")
image = post.media_thumbnail image = post.media_thumbnail
print(image) print(image)
if image: if image:
return image[0].get("url") return image[0].get("url")
if hasattr(post, "content"): if hasattr(post, "content"):
print('content') print("content")
content = " ".join(c.value for c in post.content) content = " ".join(c.value for c in post.content)
image = fetch_image_from_node_text(content) image = fetch_image_from_node_text(content)
if image: if image:
@@ -89,7 +74,7 @@ def fetch_image(post, node, namespaces):
# final attempt at getting an image from the item using description # final attempt at getting an image from the item using description
result = fetch_node_text(node, "description") result = fetch_node_text(node, "description")
if result: if result:
print('description') print("description")
image = fetch_image_from_node_text(result) image = fetch_image_from_node_text(result)
if image: if image:
return image return image
@@ -99,33 +84,27 @@ def fetch_image(post, node, namespaces):
def fetch_feeds(feeds): def fetch_feeds(feeds):
articles = []
for feed in feeds: for feed in feeds:
url = feed.get('url') url = feed.get("url")
print(url)
parsed = feedparser.parse(url) parsed = feedparser.parse(url)
namespaces = {} namespaces = {}
if hasattr(parsed, "namespaces"): if hasattr(parsed, "namespaces"):
namespaces = parsed.namespaces namespaces = parsed.namespaces
feed_image = "" feed_image = ""
if hasattr(parsed.feed, "image"): if hasattr(parsed.feed, "image"):
feed_image = parsed.feed.image.get('href') feed_image = parsed.feed.image.get("href")
articles = []
for post in parsed.entries: for post in parsed.entries:
print(post.published)
print(feed_image)
root_node = parse_content(post.description) root_node = parse_content(post.description)
image = fetch_image(post, root_node, namespaces) #or feed_image image = fetch_image(post, root_node, namespaces) or feed_image
yield {
articles.append( "url": post.link,
{ "feed": feed.get("id"),
"url": post.link, "title": post.title,
"feed": feed.get('id'), "original_image": image,
"title": post.title, "description": post.description,
"original_image": image, "date": post.published_parsed,
"description": post.description, "image": image,
"date": post.published_parsed, }
"image": feed_image,
}
)
print(articles[-1])
return articles return articles

View File

@@ -8,10 +8,10 @@ django==2.1.1
django-dynamic-filenames==1.1.3 django-dynamic-filenames==1.1.3
# Configuration # Configuration
django-environ==0.4.5 django-environ==0.4.5
whitenoise==4.0 whitenoise==4.1
# Static and Media Storage # Static and Media Storage
# ------------------------------------------------ # ------------------------------------------------
boto3==1.9.1 boto3==1.9.5
django-storages==1.7.1 django-storages==1.7.1
# django-storages-redux==1.3.2 # django-storages-redux==1.3.2
@@ -55,7 +55,7 @@ django-compressor==2.2
#fix for use with s3 buckets merged in master, so next release we can remove this #fix for use with s3 buckets merged in master, so next release we can remove this
#django-sass-processor==0.5.7 #django-sass-processor==0.5.7
git+https://github.com/jrief/django-sass-processor.git git+https://github.com/jrief/django-sass-processor.git
libsass==0.14.5 libsass==0.15.0
lxml==4.2.5 lxml==4.2.5
# WSGI Handler # WSGI Handler
@@ -72,9 +72,6 @@ gocardless_pro==1.8.0
braintree==3.48.0 braintree==3.48.0
django-autofixture==0.12.1 django-autofixture==0.12.1
git+https://github.com/olymk2/scaffold.git
#git+git://github.com/olymk2/django-wiki.git
git+git://github.com/django-wiki/django-wiki.git git+git://github.com/django-wiki/django-wiki.git
djangorestframework==3.8.2 djangorestframework==3.8.2
@@ -83,8 +80,7 @@ django-filter==2.0.0
coreapi==2.3.3 coreapi==2.3.3
# api libraries end # api libraries end
#martor==1.3.2 martor==1.3.3
git+git://github.com/olymk2/django-markdown-editor.git
django-spirit==0.6.1 django-spirit==0.6.1
django-djconfig==0.8.0 django-djconfig==0.8.0
@@ -103,4 +99,4 @@ python-magic==0.4.15
ldap3==2.5.1 ldap3==2.5.1
bcrypt==3.1.4 bcrypt==3.1.4
python-twitter==3.4.2 python-twitter==3.4.2
feedparser feedparser==5.2.1