Update feed parser to use a different library
This commit is contained in:
parent
aecd76b567
commit
d4357e3a75
|
@ -1,11 +1,10 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
from django import template
|
from django import template
|
||||||
from mhackspace.feeds.models import Feed
|
|
||||||
from scaffold.readers.rss_reader import feed_reader
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
|
||||||
register = template.Library()
|
register = template.Library()
|
||||||
|
|
||||||
@register.inclusion_tag('partials/recapture.html')
|
|
||||||
|
@register.inclusion_tag("partials/recapture.html")
|
||||||
def google_capture():
|
def google_capture():
|
||||||
return settings.CAPTCHA
|
return settings.CAPTCHA
|
||||||
|
|
|
@ -1,53 +1,34 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
import feedparser
|
|
||||||
|
|
||||||
from time import mktime
|
from time import mktime
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from urllib.request import urlretrieve
|
from urllib.request import urlretrieve
|
||||||
from django.core.files import File
|
from django.core.files import File
|
||||||
from django.utils.timezone import make_aware
|
|
||||||
from django.utils import timezone
|
|
||||||
from stdimage.utils import render_variations
|
from stdimage.utils import render_variations
|
||||||
from mhackspace.feeds.reader import fetch_feeds
|
from mhackspace.feeds.reader import fetch_feeds
|
||||||
|
|
||||||
# from scaffold.readers.rss_reader import feed_reader
|
|
||||||
|
|
||||||
from mhackspace.feeds.models import Feed, Article, image_variations
|
from mhackspace.feeds.models import Feed, Article, image_variations
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def feed_reader(feeds):
|
|
||||||
for feed in feeds:
|
|
||||||
print(feed)
|
|
||||||
yield feedparser.parse(feed["url"])
|
|
||||||
|
|
||||||
|
|
||||||
def import_feeds(feed=False):
|
def import_feeds(feed=False):
|
||||||
remove_old_articles()
|
remove_old_articles()
|
||||||
|
|
||||||
print([f.get("url") for f in get_active_feeds(feed)])
|
|
||||||
rss_articles = fetch_feeds(get_active_feeds(feed))
|
|
||||||
|
|
||||||
articles = []
|
articles = []
|
||||||
for article in rss_articles:
|
for article in fetch_feeds(get_active_feeds(feed)):
|
||||||
date = datetime.fromtimestamp(mktime(article["date"]))
|
date = datetime.fromtimestamp(mktime(article["date"]))
|
||||||
print(article["title"])
|
|
||||||
print(article["image"])
|
|
||||||
print('#############')
|
|
||||||
articles.append(
|
articles.append(
|
||||||
Article(
|
Article(
|
||||||
url=article["url"],
|
url=article["url"],
|
||||||
feed=Feed.objects.get(pk=article["feed"]),
|
feed=Feed.objects.get(pk=article["feed"]),
|
||||||
title=article["title"][0:100],
|
title=article["title"],
|
||||||
original_image=article["image"][0:100],
|
original_image=article["image"],
|
||||||
description=article["description"],
|
description=article["description"],
|
||||||
date=date,
|
date=date,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
articles = Article.objects.bulk_create(articles)
|
articles = Article.objects.bulk_create(articles)
|
||||||
download_remote_images()
|
download_remote_images()
|
||||||
return articles
|
return articles
|
||||||
|
|
|
@ -5,6 +5,9 @@ from lxml import etree
|
||||||
from lxml.html.clean import Cleaner
|
from lxml.html.clean import Cleaner
|
||||||
from io import StringIO, BytesIO
|
from io import StringIO, BytesIO
|
||||||
|
|
||||||
|
from django.utils.html import escape
|
||||||
|
|
||||||
|
|
||||||
namespaces = {}
|
namespaces = {}
|
||||||
urls = [
|
urls = [
|
||||||
"https://feeds.feedburner.com/projects-jl",
|
"https://feeds.feedburner.com/projects-jl",
|
||||||
|
@ -20,18 +23,6 @@ def parse_content(content):
|
||||||
headers = {
|
headers = {
|
||||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0"
|
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0"
|
||||||
}
|
}
|
||||||
html_cleaner = Cleaner()
|
|
||||||
html_cleaner.javascript = True
|
|
||||||
html_cleaner.style = True
|
|
||||||
html_cleaner.remove_tags = [
|
|
||||||
"script",
|
|
||||||
"iframe",
|
|
||||||
"link",
|
|
||||||
"style",
|
|
||||||
"img",
|
|
||||||
"div",
|
|
||||||
]
|
|
||||||
# ~ html_cleaner.allow_tags = ['a', 'p', 'strong']
|
|
||||||
|
|
||||||
html_img_cleaner = Cleaner(allow_tags=["img"], remove_unknown_tags=False)
|
html_img_cleaner = Cleaner(allow_tags=["img"], remove_unknown_tags=False)
|
||||||
html_img_cleaner.allow_tags = ["img"]
|
html_img_cleaner.allow_tags = ["img"]
|
||||||
|
@ -40,17 +31,13 @@ def parse_content(content):
|
||||||
remove_blank_text=True, ns_clean=True, encoding="utf-8"
|
remove_blank_text=True, ns_clean=True, encoding="utf-8"
|
||||||
)
|
)
|
||||||
|
|
||||||
print("------------------")
|
return lxml.etree.XML("<div>" + escape(content) + "</div>", xml_parser)
|
||||||
print(content)
|
|
||||||
|
|
||||||
dom = lxml.etree.XML("<div>" + content + "</div>", xml_parser)
|
|
||||||
return dom
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_image_from_node_text(text):
|
def fetch_image_from_node_text(text):
|
||||||
description = lxml.etree.parse(text, html_parser)
|
description = lxml.etree.parse(text, html_parser)
|
||||||
for image in description.xpath(".//img"):
|
for image in description.xpath(".//img"):
|
||||||
print('fetch image from node text')
|
print("fetch image from node text")
|
||||||
return image.get("src")
|
return image.get("src")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
@ -71,16 +58,14 @@ def fetch_node_text(node, name, default=u""):
|
||||||
def fetch_image(post, node, namespaces):
|
def fetch_image(post, node, namespaces):
|
||||||
"""Try and get an image from an item in the feed, use various fall back methods"""
|
"""Try and get an image from an item in the feed, use various fall back methods"""
|
||||||
if hasattr(post, "media_thumbnail"):
|
if hasattr(post, "media_thumbnail"):
|
||||||
print('media')
|
print("media")
|
||||||
|
|
||||||
image = post.media_thumbnail
|
image = post.media_thumbnail
|
||||||
print(image)
|
print(image)
|
||||||
|
|
||||||
if image:
|
if image:
|
||||||
return image[0].get("url")
|
return image[0].get("url")
|
||||||
|
|
||||||
if hasattr(post, "content"):
|
if hasattr(post, "content"):
|
||||||
print('content')
|
print("content")
|
||||||
content = " ".join(c.value for c in post.content)
|
content = " ".join(c.value for c in post.content)
|
||||||
image = fetch_image_from_node_text(content)
|
image = fetch_image_from_node_text(content)
|
||||||
if image:
|
if image:
|
||||||
|
@ -89,7 +74,7 @@ def fetch_image(post, node, namespaces):
|
||||||
# final attempt at getting an image from the item using description
|
# final attempt at getting an image from the item using description
|
||||||
result = fetch_node_text(node, "description")
|
result = fetch_node_text(node, "description")
|
||||||
if result:
|
if result:
|
||||||
print('description')
|
print("description")
|
||||||
image = fetch_image_from_node_text(result)
|
image = fetch_image_from_node_text(result)
|
||||||
if image:
|
if image:
|
||||||
return image
|
return image
|
||||||
|
@ -99,33 +84,27 @@ def fetch_image(post, node, namespaces):
|
||||||
|
|
||||||
|
|
||||||
def fetch_feeds(feeds):
|
def fetch_feeds(feeds):
|
||||||
|
articles = []
|
||||||
|
|
||||||
for feed in feeds:
|
for feed in feeds:
|
||||||
url = feed.get('url')
|
url = feed.get("url")
|
||||||
print(url)
|
|
||||||
parsed = feedparser.parse(url)
|
parsed = feedparser.parse(url)
|
||||||
namespaces = {}
|
namespaces = {}
|
||||||
if hasattr(parsed, "namespaces"):
|
if hasattr(parsed, "namespaces"):
|
||||||
namespaces = parsed.namespaces
|
namespaces = parsed.namespaces
|
||||||
feed_image = ""
|
feed_image = ""
|
||||||
if hasattr(parsed.feed, "image"):
|
if hasattr(parsed.feed, "image"):
|
||||||
feed_image = parsed.feed.image.get('href')
|
feed_image = parsed.feed.image.get("href")
|
||||||
articles = []
|
|
||||||
for post in parsed.entries:
|
for post in parsed.entries:
|
||||||
print(post.published)
|
|
||||||
print(feed_image)
|
|
||||||
root_node = parse_content(post.description)
|
root_node = parse_content(post.description)
|
||||||
image = fetch_image(post, root_node, namespaces) #or feed_image
|
image = fetch_image(post, root_node, namespaces) or feed_image
|
||||||
|
yield {
|
||||||
articles.append(
|
|
||||||
{
|
|
||||||
"url": post.link,
|
"url": post.link,
|
||||||
"feed": feed.get('id'),
|
"feed": feed.get("id"),
|
||||||
"title": post.title,
|
"title": post.title,
|
||||||
"original_image": image,
|
"original_image": image,
|
||||||
"description": post.description,
|
"description": post.description,
|
||||||
"date": post.published_parsed,
|
"date": post.published_parsed,
|
||||||
"image": feed_image,
|
"image": image,
|
||||||
}
|
}
|
||||||
)
|
|
||||||
print(articles[-1])
|
|
||||||
return articles
|
return articles
|
||||||
|
|
|
@ -8,10 +8,10 @@ django==2.1.1
|
||||||
django-dynamic-filenames==1.1.3
|
django-dynamic-filenames==1.1.3
|
||||||
# Configuration
|
# Configuration
|
||||||
django-environ==0.4.5
|
django-environ==0.4.5
|
||||||
whitenoise==4.0
|
whitenoise==4.1
|
||||||
# Static and Media Storage
|
# Static and Media Storage
|
||||||
# ------------------------------------------------
|
# ------------------------------------------------
|
||||||
boto3==1.9.1
|
boto3==1.9.5
|
||||||
django-storages==1.7.1
|
django-storages==1.7.1
|
||||||
# django-storages-redux==1.3.2
|
# django-storages-redux==1.3.2
|
||||||
|
|
||||||
|
@ -55,7 +55,7 @@ django-compressor==2.2
|
||||||
#fix for use with s3 buckets merged in master, so next release we can remove this
|
#fix for use with s3 buckets merged in master, so next release we can remove this
|
||||||
#django-sass-processor==0.5.7
|
#django-sass-processor==0.5.7
|
||||||
git+https://github.com/jrief/django-sass-processor.git
|
git+https://github.com/jrief/django-sass-processor.git
|
||||||
libsass==0.14.5
|
libsass==0.15.0
|
||||||
lxml==4.2.5
|
lxml==4.2.5
|
||||||
|
|
||||||
# WSGI Handler
|
# WSGI Handler
|
||||||
|
@ -72,9 +72,6 @@ gocardless_pro==1.8.0
|
||||||
braintree==3.48.0
|
braintree==3.48.0
|
||||||
|
|
||||||
django-autofixture==0.12.1
|
django-autofixture==0.12.1
|
||||||
|
|
||||||
git+https://github.com/olymk2/scaffold.git
|
|
||||||
#git+git://github.com/olymk2/django-wiki.git
|
|
||||||
git+git://github.com/django-wiki/django-wiki.git
|
git+git://github.com/django-wiki/django-wiki.git
|
||||||
|
|
||||||
djangorestframework==3.8.2
|
djangorestframework==3.8.2
|
||||||
|
@ -83,8 +80,7 @@ django-filter==2.0.0
|
||||||
coreapi==2.3.3
|
coreapi==2.3.3
|
||||||
# api libraries end
|
# api libraries end
|
||||||
|
|
||||||
#martor==1.3.2
|
martor==1.3.3
|
||||||
git+git://github.com/olymk2/django-markdown-editor.git
|
|
||||||
|
|
||||||
django-spirit==0.6.1
|
django-spirit==0.6.1
|
||||||
django-djconfig==0.8.0
|
django-djconfig==0.8.0
|
||||||
|
@ -103,4 +99,4 @@ python-magic==0.4.15
|
||||||
ldap3==2.5.1
|
ldap3==2.5.1
|
||||||
bcrypt==3.1.4
|
bcrypt==3.1.4
|
||||||
python-twitter==3.4.2
|
python-twitter==3.4.2
|
||||||
feedparser
|
feedparser==5.2.1
|
||||||
|
|
Loading…
Reference in New Issue