Update feed parser to use a different library
This commit is contained in:
parent
aecd76b567
commit
d4357e3a75
|
@ -1,11 +1,10 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
from django import template
|
||||
from mhackspace.feeds.models import Feed
|
||||
from scaffold.readers.rss_reader import feed_reader
|
||||
from django.conf import settings
|
||||
|
||||
register = template.Library()
|
||||
|
||||
@register.inclusion_tag('partials/recapture.html')
|
||||
|
||||
@register.inclusion_tag("partials/recapture.html")
|
||||
def google_capture():
|
||||
return settings.CAPTCHA
|
||||
|
|
|
@ -1,53 +1,34 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
import os
|
||||
import logging
|
||||
import feedparser
|
||||
|
||||
from time import mktime
|
||||
from datetime import datetime
|
||||
from urllib.request import urlretrieve
|
||||
from django.core.files import File
|
||||
from django.utils.timezone import make_aware
|
||||
from django.utils import timezone
|
||||
from stdimage.utils import render_variations
|
||||
from mhackspace.feeds.reader import fetch_feeds
|
||||
|
||||
# from scaffold.readers.rss_reader import feed_reader
|
||||
|
||||
from mhackspace.feeds.models import Feed, Article, image_variations
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def feed_reader(feeds):
|
||||
for feed in feeds:
|
||||
print(feed)
|
||||
yield feedparser.parse(feed["url"])
|
||||
|
||||
|
||||
def import_feeds(feed=False):
|
||||
remove_old_articles()
|
||||
|
||||
print([f.get("url") for f in get_active_feeds(feed)])
|
||||
rss_articles = fetch_feeds(get_active_feeds(feed))
|
||||
|
||||
articles = []
|
||||
for article in rss_articles:
|
||||
for article in fetch_feeds(get_active_feeds(feed)):
|
||||
date = datetime.fromtimestamp(mktime(article["date"]))
|
||||
print(article["title"])
|
||||
print(article["image"])
|
||||
print('#############')
|
||||
articles.append(
|
||||
Article(
|
||||
url=article["url"],
|
||||
feed=Feed.objects.get(pk=article["feed"]),
|
||||
title=article["title"][0:100],
|
||||
original_image=article["image"][0:100],
|
||||
title=article["title"],
|
||||
original_image=article["image"],
|
||||
description=article["description"],
|
||||
date=date,
|
||||
)
|
||||
)
|
||||
|
||||
articles = Article.objects.bulk_create(articles)
|
||||
download_remote_images()
|
||||
return articles
|
||||
|
|
|
@ -5,6 +5,9 @@ from lxml import etree
|
|||
from lxml.html.clean import Cleaner
|
||||
from io import StringIO, BytesIO
|
||||
|
||||
from django.utils.html import escape
|
||||
|
||||
|
||||
namespaces = {}
|
||||
urls = [
|
||||
"https://feeds.feedburner.com/projects-jl",
|
||||
|
@ -20,18 +23,6 @@ def parse_content(content):
|
|||
headers = {
|
||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0"
|
||||
}
|
||||
html_cleaner = Cleaner()
|
||||
html_cleaner.javascript = True
|
||||
html_cleaner.style = True
|
||||
html_cleaner.remove_tags = [
|
||||
"script",
|
||||
"iframe",
|
||||
"link",
|
||||
"style",
|
||||
"img",
|
||||
"div",
|
||||
]
|
||||
# ~ html_cleaner.allow_tags = ['a', 'p', 'strong']
|
||||
|
||||
html_img_cleaner = Cleaner(allow_tags=["img"], remove_unknown_tags=False)
|
||||
html_img_cleaner.allow_tags = ["img"]
|
||||
|
@ -40,17 +31,13 @@ def parse_content(content):
|
|||
remove_blank_text=True, ns_clean=True, encoding="utf-8"
|
||||
)
|
||||
|
||||
print("------------------")
|
||||
print(content)
|
||||
|
||||
dom = lxml.etree.XML("<div>" + content + "</div>", xml_parser)
|
||||
return dom
|
||||
return lxml.etree.XML("<div>" + escape(content) + "</div>", xml_parser)
|
||||
|
||||
|
||||
def fetch_image_from_node_text(text):
|
||||
description = lxml.etree.parse(text, html_parser)
|
||||
for image in description.xpath(".//img"):
|
||||
print('fetch image from node text')
|
||||
print("fetch image from node text")
|
||||
return image.get("src")
|
||||
return None
|
||||
|
||||
|
@ -71,16 +58,14 @@ def fetch_node_text(node, name, default=u""):
|
|||
def fetch_image(post, node, namespaces):
|
||||
"""Try and get an image from an item in the feed, use various fall back methods"""
|
||||
if hasattr(post, "media_thumbnail"):
|
||||
print('media')
|
||||
|
||||
print("media")
|
||||
image = post.media_thumbnail
|
||||
print(image)
|
||||
|
||||
if image:
|
||||
return image[0].get("url")
|
||||
|
||||
if hasattr(post, "content"):
|
||||
print('content')
|
||||
print("content")
|
||||
content = " ".join(c.value for c in post.content)
|
||||
image = fetch_image_from_node_text(content)
|
||||
if image:
|
||||
|
@ -89,7 +74,7 @@ def fetch_image(post, node, namespaces):
|
|||
# final attempt at getting an image from the item using description
|
||||
result = fetch_node_text(node, "description")
|
||||
if result:
|
||||
print('description')
|
||||
print("description")
|
||||
image = fetch_image_from_node_text(result)
|
||||
if image:
|
||||
return image
|
||||
|
@ -99,33 +84,27 @@ def fetch_image(post, node, namespaces):
|
|||
|
||||
|
||||
def fetch_feeds(feeds):
|
||||
articles = []
|
||||
|
||||
for feed in feeds:
|
||||
url = feed.get('url')
|
||||
print(url)
|
||||
url = feed.get("url")
|
||||
parsed = feedparser.parse(url)
|
||||
namespaces = {}
|
||||
if hasattr(parsed, "namespaces"):
|
||||
namespaces = parsed.namespaces
|
||||
feed_image = ""
|
||||
if hasattr(parsed.feed, "image"):
|
||||
feed_image = parsed.feed.image.get('href')
|
||||
articles = []
|
||||
feed_image = parsed.feed.image.get("href")
|
||||
for post in parsed.entries:
|
||||
print(post.published)
|
||||
print(feed_image)
|
||||
root_node = parse_content(post.description)
|
||||
image = fetch_image(post, root_node, namespaces) #or feed_image
|
||||
|
||||
articles.append(
|
||||
{
|
||||
image = fetch_image(post, root_node, namespaces) or feed_image
|
||||
yield {
|
||||
"url": post.link,
|
||||
"feed": feed.get('id'),
|
||||
"feed": feed.get("id"),
|
||||
"title": post.title,
|
||||
"original_image": image,
|
||||
"description": post.description,
|
||||
"date": post.published_parsed,
|
||||
"image": feed_image,
|
||||
"image": image,
|
||||
}
|
||||
)
|
||||
print(articles[-1])
|
||||
return articles
|
||||
|
|
|
@ -8,10 +8,10 @@ django==2.1.1
|
|||
django-dynamic-filenames==1.1.3
|
||||
# Configuration
|
||||
django-environ==0.4.5
|
||||
whitenoise==4.0
|
||||
whitenoise==4.1
|
||||
# Static and Media Storage
|
||||
# ------------------------------------------------
|
||||
boto3==1.9.1
|
||||
boto3==1.9.5
|
||||
django-storages==1.7.1
|
||||
# django-storages-redux==1.3.2
|
||||
|
||||
|
@ -55,7 +55,7 @@ django-compressor==2.2
|
|||
#fix for use with s3 buckets merged in master, so next release we can remove this
|
||||
#django-sass-processor==0.5.7
|
||||
git+https://github.com/jrief/django-sass-processor.git
|
||||
libsass==0.14.5
|
||||
libsass==0.15.0
|
||||
lxml==4.2.5
|
||||
|
||||
# WSGI Handler
|
||||
|
@ -72,9 +72,6 @@ gocardless_pro==1.8.0
|
|||
braintree==3.48.0
|
||||
|
||||
django-autofixture==0.12.1
|
||||
|
||||
git+https://github.com/olymk2/scaffold.git
|
||||
#git+git://github.com/olymk2/django-wiki.git
|
||||
git+git://github.com/django-wiki/django-wiki.git
|
||||
|
||||
djangorestframework==3.8.2
|
||||
|
@ -83,8 +80,7 @@ django-filter==2.0.0
|
|||
coreapi==2.3.3
|
||||
# api libraries end
|
||||
|
||||
#martor==1.3.2
|
||||
git+git://github.com/olymk2/django-markdown-editor.git
|
||||
martor==1.3.3
|
||||
|
||||
django-spirit==0.6.1
|
||||
django-djconfig==0.8.0
|
||||
|
@ -103,4 +99,4 @@ python-magic==0.4.15
|
|||
ldap3==2.5.1
|
||||
bcrypt==3.1.4
|
||||
python-twitter==3.4.2
|
||||
feedparser
|
||||
feedparser==5.2.1
|
||||
|
|
Loading…
Reference in New Issue