Update feed parser to use a different library

This commit is contained in:
Oliver Marks 2018-09-18 20:11:45 +01:00
parent aecd76b567
commit d4357e3a75
4 changed files with 33 additions and 78 deletions

View File

@ -1,11 +1,10 @@
# -*- coding: utf-8 -*-
from django import template
from mhackspace.feeds.models import Feed
from scaffold.readers.rss_reader import feed_reader
from django.conf import settings
register = template.Library()
@register.inclusion_tag('partials/recapture.html')
@register.inclusion_tag("partials/recapture.html")
def google_capture():
return settings.CAPTCHA
return settings.CAPTCHA

View File

@ -1,53 +1,34 @@
# -*- coding: utf-8 -*-
import os
import logging
import feedparser
from time import mktime
from datetime import datetime
from urllib.request import urlretrieve
from django.core.files import File
from django.utils.timezone import make_aware
from django.utils import timezone
from stdimage.utils import render_variations
from mhackspace.feeds.reader import fetch_feeds
# from scaffold.readers.rss_reader import feed_reader
from mhackspace.feeds.models import Feed, Article, image_variations
logger = logging.getLogger(__name__)
def feed_reader(feeds):
for feed in feeds:
print(feed)
yield feedparser.parse(feed["url"])
def import_feeds(feed=False):
remove_old_articles()
print([f.get("url") for f in get_active_feeds(feed)])
rss_articles = fetch_feeds(get_active_feeds(feed))
articles = []
for article in rss_articles:
for article in fetch_feeds(get_active_feeds(feed)):
date = datetime.fromtimestamp(mktime(article["date"]))
print(article["title"])
print(article["image"])
print('#############')
articles.append(
Article(
url=article["url"],
feed=Feed.objects.get(pk=article["feed"]),
title=article["title"][0:100],
original_image=article["image"][0:100],
title=article["title"],
original_image=article["image"],
description=article["description"],
date=date,
)
)
articles = Article.objects.bulk_create(articles)
download_remote_images()
return articles

View File

@ -5,6 +5,9 @@ from lxml import etree
from lxml.html.clean import Cleaner
from io import StringIO, BytesIO
from django.utils.html import escape
namespaces = {}
urls = [
"https://feeds.feedburner.com/projects-jl",
@ -20,18 +23,6 @@ def parse_content(content):
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0"
}
html_cleaner = Cleaner()
html_cleaner.javascript = True
html_cleaner.style = True
html_cleaner.remove_tags = [
"script",
"iframe",
"link",
"style",
"img",
"div",
]
# ~ html_cleaner.allow_tags = ['a', 'p', 'strong']
html_img_cleaner = Cleaner(allow_tags=["img"], remove_unknown_tags=False)
html_img_cleaner.allow_tags = ["img"]
@ -40,17 +31,13 @@ def parse_content(content):
remove_blank_text=True, ns_clean=True, encoding="utf-8"
)
print("------------------")
print(content)
dom = lxml.etree.XML("<div>" + content + "</div>", xml_parser)
return dom
return lxml.etree.XML("<div>" + escape(content) + "</div>", xml_parser)
def fetch_image_from_node_text(text):
description = lxml.etree.parse(text, html_parser)
for image in description.xpath(".//img"):
print('fetch image from node text')
print("fetch image from node text")
return image.get("src")
return None
@ -71,16 +58,14 @@ def fetch_node_text(node, name, default=u""):
def fetch_image(post, node, namespaces):
"""Try and get an image from an item in the feed, use various fall back methods"""
if hasattr(post, "media_thumbnail"):
print('media')
print("media")
image = post.media_thumbnail
print(image)
if image:
return image[0].get("url")
if hasattr(post, "content"):
print('content')
print("content")
content = " ".join(c.value for c in post.content)
image = fetch_image_from_node_text(content)
if image:
@ -89,7 +74,7 @@ def fetch_image(post, node, namespaces):
# final attempt at getting an image from the item using description
result = fetch_node_text(node, "description")
if result:
print('description')
print("description")
image = fetch_image_from_node_text(result)
if image:
return image
@ -99,33 +84,27 @@ def fetch_image(post, node, namespaces):
def fetch_feeds(feeds):
articles = []
for feed in feeds:
url = feed.get('url')
print(url)
url = feed.get("url")
parsed = feedparser.parse(url)
namespaces = {}
if hasattr(parsed, "namespaces"):
namespaces = parsed.namespaces
feed_image = ""
if hasattr(parsed.feed, "image"):
feed_image = parsed.feed.image.get('href')
articles = []
feed_image = parsed.feed.image.get("href")
for post in parsed.entries:
print(post.published)
print(feed_image)
root_node = parse_content(post.description)
image = fetch_image(post, root_node, namespaces) #or feed_image
articles.append(
{
"url": post.link,
"feed": feed.get('id'),
"title": post.title,
"original_image": image,
"description": post.description,
"date": post.published_parsed,
"image": feed_image,
}
)
print(articles[-1])
image = fetch_image(post, root_node, namespaces) or feed_image
yield {
"url": post.link,
"feed": feed.get("id"),
"title": post.title,
"original_image": image,
"description": post.description,
"date": post.published_parsed,
"image": image,
}
return articles

View File

@ -8,10 +8,10 @@ django==2.1.1
django-dynamic-filenames==1.1.3
# Configuration
django-environ==0.4.5
whitenoise==4.0
whitenoise==4.1
# Static and Media Storage
# ------------------------------------------------
boto3==1.9.1
boto3==1.9.5
django-storages==1.7.1
# django-storages-redux==1.3.2
@ -55,7 +55,7 @@ django-compressor==2.2
#fix for use with s3 buckets merged in master, so next release we can remove this
#django-sass-processor==0.5.7
git+https://github.com/jrief/django-sass-processor.git
libsass==0.14.5
libsass==0.15.0
lxml==4.2.5
# WSGI Handler
@ -72,9 +72,6 @@ gocardless_pro==1.8.0
braintree==3.48.0
django-autofixture==0.12.1
git+https://github.com/olymk2/scaffold.git
#git+git://github.com/olymk2/django-wiki.git
git+git://github.com/django-wiki/django-wiki.git
djangorestframework==3.8.2
@ -83,8 +80,7 @@ django-filter==2.0.0
coreapi==2.3.3
# api libraries end
#martor==1.3.2
git+git://github.com/olymk2/django-markdown-editor.git
martor==1.3.3
django-spirit==0.6.1
django-djconfig==0.8.0
@ -103,4 +99,4 @@ python-magic==0.4.15
ldap3==2.5.1
bcrypt==3.1.4
python-twitter==3.4.2
feedparser
feedparser==5.2.1