Fix description-fallback image extraction passing a string instead of a stream to lxml.etree.parse
This commit is contained in:
parent
9d0588e6a1
commit
d8c853ee38
|
@ -1,9 +1,7 @@
|
||||||
import lxml
|
import lxml
|
||||||
import feedparser
|
import feedparser
|
||||||
from operator import itemgetter
|
from io import StringIO
|
||||||
from lxml import etree
|
|
||||||
from lxml.html.clean import Cleaner
|
from lxml.html.clean import Cleaner
|
||||||
from io import StringIO, BytesIO
|
|
||||||
|
|
||||||
from django.utils.html import escape
|
from django.utils.html import escape
|
||||||
|
|
||||||
|
@ -20,10 +18,6 @@ html_parser = lxml.etree.HTMLParser()
|
||||||
|
|
||||||
|
|
||||||
def parse_content(content):
|
def parse_content(content):
|
||||||
headers = {
|
|
||||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0"
|
|
||||||
}
|
|
||||||
|
|
||||||
html_img_cleaner = Cleaner(allow_tags=["img"], remove_unknown_tags=False)
|
html_img_cleaner = Cleaner(allow_tags=["img"], remove_unknown_tags=False)
|
||||||
html_img_cleaner.allow_tags = ["img"]
|
html_img_cleaner.allow_tags = ["img"]
|
||||||
|
|
||||||
|
@ -35,9 +29,8 @@ def parse_content(content):
|
||||||
|
|
||||||
|
|
||||||
def fetch_image_from_node_text(text):
|
def fetch_image_from_node_text(text):
|
||||||
description = lxml.etree.parse(text, html_parser)
|
description = lxml.etree.parse(StringIO(text), html_parser)
|
||||||
for image in description.xpath(".//img"):
|
for image in description.xpath(".//img"):
|
||||||
print("fetch image from node text")
|
|
||||||
return image.get("src")
|
return image.get("src")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
@ -58,14 +51,11 @@ def fetch_node_text(node, name, default=u""):
|
||||||
def fetch_image(post, node, namespaces):
|
def fetch_image(post, node, namespaces):
|
||||||
"""Try and get an image from an item in the feed, use various fall back methods"""
|
"""Try and get an image from an item in the feed, use various fall back methods"""
|
||||||
if hasattr(post, "media_thumbnail"):
|
if hasattr(post, "media_thumbnail"):
|
||||||
print("media")
|
|
||||||
image = post.media_thumbnail
|
image = post.media_thumbnail
|
||||||
print(image)
|
|
||||||
if image:
|
if image:
|
||||||
return image[0].get("url")
|
return image[0].get("url")
|
||||||
|
|
||||||
if hasattr(post, "content"):
|
if hasattr(post, "content"):
|
||||||
print("content")
|
|
||||||
content = " ".join(c.value for c in post.content)
|
content = " ".join(c.value for c in post.content)
|
||||||
image = fetch_image_from_node_text(content)
|
image = fetch_image_from_node_text(content)
|
||||||
if image:
|
if image:
|
||||||
|
@ -74,7 +64,6 @@ def fetch_image(post, node, namespaces):
|
||||||
# final attempt at getting an image from the item using description
|
# final attempt at getting an image from the item using description
|
||||||
result = fetch_node_text(node, "description")
|
result = fetch_node_text(node, "description")
|
||||||
if result:
|
if result:
|
||||||
print("description")
|
|
||||||
image = fetch_image_from_node_text(result)
|
image = fetch_image_from_node_text(result)
|
||||||
if image:
|
if image:
|
||||||
return image
|
return image
|
||||||
|
|
Loading…
Reference in New Issue