Working on improved image parsing for captions

This commit is contained in:
Oly 2018-11-27 14:19:25 +00:00
parent ac899b8ada
commit 17994c2eb7
4 changed files with 108 additions and 16 deletions

34
eorg/helper.py Normal file
View File

@ -0,0 +1,34 @@
from eorg import tokens
from eorg.tokens import Token
from eorg.const import ESCAPE, image_extensions
def parse_img_or_link(char, step):
    """Parse an org-style ``[[path][alt]]`` image or link starting at *char*.

    Args:
        char: The character the caller is currently examining.
        step: Iterator yielding the characters that follow *char*. It is
            advanced as the link text is consumed.

    Returns:
        A ``(char, token)`` tuple. When the input does not start with
        ``[[``, ``token`` is ``None`` and ``char`` is the character the
        caller should carry on with. Otherwise ``char`` is ``""`` and
        ``token`` is an ``IMAGE`` token when the path ends with one of
        ``image_extensions``, else a ``LINK`` token; in both cases the
        token value is ``[path, alt]`` (``alt`` is ``""`` when absent).
    """
    if char != "[":
        return char, None

    char = next(step, None)
    if char != "[":
        # NOTE(review): only the character after the lone "[" is handed
        # back here; the bracket itself is not returned to the caller.
        return char, None

    # Characters that end the path / description segment.
    terminators = ["]"] + ESCAPE

    # Read the path. The truthiness guard stops the loop when the iterator
    # is exhausted (char is None) so an unterminated "[[path" no longer
    # raises TypeError on `path += None`.
    char = next(step, None)
    path = ""
    while char and char not in terminators:
        path += char
        char = next(step, None)
    # Step past the terminator: the next char is either "[" (description
    # follows) or the second closing bracket.
    char = next(step, None)

    alt = ""
    if char == "[":
        char = next(step, None)
        while char and char not in terminators:
            alt += char
            char = next(step, None)
        # Consume the final closing bracket of "]]".
        char = next(step, None)

    if path.endswith(image_extensions):
        return "", Token(tokens.IMAGE, [path, alt])
    return "", Token(tokens.LINK, [path, alt])

View File

@ -2,6 +2,7 @@ import re
from eorg import tokens from eorg import tokens
from eorg.tokens import Token from eorg.tokens import Token
from eorg.const import TOKENS, METADATA, ESCAPE, image_extensions from eorg.const import TOKENS, METADATA, ESCAPE, image_extensions
from eorg.helper import parse_img_or_link
class Document: class Document:
@ -18,16 +19,20 @@ class Document:
if not idx: if not idx:
if default is not None: if default is not None:
return default return default
raise AttributeError( raise AttributeError(
f"Attribute of {name} does not exist in document" f"Attribute of {name} does not exist in document"
) )
if len(idx) == 1: if len(idx) == 1:
return self.doc[idx[0]].value return self.doc[idx[0]].value
return [self.doc[v].value for v in idx] return [self.doc[v].value for v in idx]
def token(self): def token(self):
if self.doc: if self.doc:
return self.doc[-1].token return self.doc[-1].token
return "" return ""
def update(self, value): def update(self, value):
@ -37,13 +42,16 @@ class Document:
self.pos = 0 self.pos = 0
for item in self.doc: for item in self.doc:
yield item yield item
self.pos += 1 self.pos += 1
def previous(self, match): def previous(self, match):
if self.pos is 0: if self.pos is 0:
return None return None
if self.doc[self.pos - 1].token != match: if self.doc[self.pos - 1].token != match:
return None return None
return self.doc[self.pos - 1] return self.doc[self.pos - 1]
def filter(self, value): def filter(self, value):
@ -56,12 +64,14 @@ class Document:
for item in self.doc: for item in self.doc:
if item.token in METADATA: if item.token in METADATA:
continue continue
yield item yield item
def images(self): def images(self):
for item in self.__iter__(): for item in self.__iter__():
if item.token == tokens.IMAGE: if item.token == tokens.IMAGE:
yield item yield item
if isinstance(item.value, list): if isinstance(item.value, list):
for token in item.value: for token in item.value:
if token.token == tokens.IMAGE: if token.token == tokens.IMAGE:
@ -89,6 +99,7 @@ def parsebody(text, rx):
match = re.search(rx, text) match = re.search(rx, text)
if match: if match:
return False, None return False, None
return rx, text + "\n" return rx, text + "\n"
@ -98,23 +109,29 @@ def parseline(text):
match = re.search(rx, text) match = re.search(rx, text)
if not match: if not match:
continue continue
value = text[match.end() :]
value = text[match.end():]
level = len(match.group(0)) level = len(match.group(0))
if count is True: if count is True:
attrs = {"depth": level} attrs = {"depth": level}
if key == tokens.META: if key == tokens.META:
return (block, Token(token=match.group(0)[s:e], value=value)) return (block, Token(token=match.group(0)[s:e], value=value))
if key == tokens.SOURCE: if key == tokens.SOURCE:
return block, Token(token=key, attrs=parse_attrs(value)) return block, Token(token=key, attrs=parse_attrs(value))
if key == tokens.TABLE: if key == tokens.TABLE:
return block, Token(token=key, value=text + "\n") return block, Token(token=key, value=text + "\n")
if key == tokens.BULLET: if key == tokens.BULLET:
return block, Token(token=key, value=text + "\n") return block, Token(token=key, value=text + "\n")
return block, Token(token=key, value=value, attrs=attrs) return block, Token(token=key, value=value, attrs=attrs)
text = text.strip() text = text.strip()
if text == "": if text == "":
return False, Token(token=tokens.BLANK, value=text) return False, Token(token=tokens.BLANK, value=text)
return False, Token(token=tokens.LIST, value=text + " ") return False, Token(token=tokens.LIST, value=text + " ")
@ -125,10 +142,12 @@ def parse_text(txt):
def img(char, step): def img(char, step):
if char != "[": if char != "[":
return char return char
char = next(step, None) char = next(step, None)
if char != "[": if char != "[":
return char return char
char = next(step, None) char = next(step, None)
path = "" path = ""
@ -155,6 +174,7 @@ def parse_text(txt):
def emphasis(char, step, end, tag): def emphasis(char, step, end, tag):
if not char or char != end: if not char or char != end:
return char return char
char = next(step, None) char = next(step, None)
r = "" r = ""
while char and char not in [end] + ESCAPE: while char and char not in [end] + ESCAPE:
@ -171,15 +191,21 @@ def parse_text(txt):
char = emphasis(char, step, "_", tokens.UNDERLINED) char = emphasis(char, step, "_", tokens.UNDERLINED)
char = emphasis(char, step, "=", tokens.VERBATIM) char = emphasis(char, step, "=", tokens.VERBATIM)
char = emphasis(char, step, "~", "PRE") char = emphasis(char, step, "~", "PRE")
char = img(char, step) #char = img(char, step)
char, token = parse_img_or_link(char, step)
if token:
tokenlist.append(token)
if not char: if not char:
continue continue
if len(tokenlist) == 0: if len(tokenlist) == 0:
tokenlist.append(Token(tokens.TEXT, char)) tokenlist.append(Token(tokens.TEXT, char))
continue continue
if tokenlist[-1].token != tokens.TEXT: if tokenlist[-1].token != tokens.TEXT:
tokenlist.append(Token(tokens.TEXT, char)) tokenlist.append(Token(tokens.TEXT, char))
continue continue
tokenlist[-1].value += char tokenlist[-1].value += char
return tokenlist return tokenlist
@ -194,11 +220,13 @@ def parse(stream):
if block: if block:
doc.update(token) doc.update(token)
continue continue
block, token = parseline(line) block, token = parseline(line)
if token: if token:
if doc.token() == tokens.LIST and token.token == tokens.LIST: if doc.token() == tokens.LIST and token.token == tokens.LIST:
doc.update(token.value) doc.update(token.value)
continue continue
doc.append(token) doc.append(token)
for item in doc.filter(tokens.LIST): for item in doc.filter(tokens.LIST):

View File

@ -49,12 +49,30 @@ def test_image():
text = "[[../../../images/opengl/point-sprite-shader.png]]" text = "[[../../../images/opengl/point-sprite-shader.png]]"
expected = [ expected = [
Token(tokens.IMAGE, ["../../../images/opengl/point-sprite-shader.png", ""]), Token(
tokens.IMAGE,
["../../../images/opengl/point-sprite-shader.png", ""],
)
] ]
result = parse_text(text) result = parse(text).doc
assert result[0].value == expected[0].value assert result[0].value == expected[0].value
def test_image_with_caption():
    """Parse a caption line followed by text holding an image link.

    Checks that exactly two tokens come back and that each one matches
    the expected token type and value, in order.
    """
    source = StringIO("""#+CAPTION: Test Image
text [[../../test.jpg][test]]""")
    caption = Token(tokens.CAPTION, " Test Image")
    image = Token(tokens.IMAGE, ["../../test.jpg", "test"])
    expected = [caption, Token(tokens.LIST, [image])]

    result = parse(source).doc

    assert len(result) == 2
    # Compare token type first, then value, for each position in turn.
    for actual, wanted in zip(result, expected):
        assert actual.token == wanted.token
        assert actual.value == wanted.value
def test_link(): def test_link():
text = "parse link [[../../test.html][test]] after link" text = "parse link [[../../test.html][test]] after link"
expected = [ expected = [
@ -108,7 +126,7 @@ head -n 5 examples/html-plain/example.py
result = parse(text).doc result = parse(text).doc
assert result[0].token == tokens.BLANK assert result[0].token == tokens.BLANK
assert result[0].value == expected[0].value assert result[0].value == expected[0].value
assert result[1].attrs.get('language') == 'sh' assert result[1].attrs.get("language") == "sh"
assert result[1].value == expected[1].value assert result[1].value == expected[1].value

View File

@ -7,47 +7,47 @@ from eorg.generate import html
def test_meta_headers(): def test_meta_headers():
text="#+TITLE: test title" text = "#+TITLE: test title"
rx = const.t_META rx = const.t_META
match = re.search(rx, text) match = re.search(rx, text)
assert match is not None assert match is not None
text="#+UNKNOWN: test title" text = "#+UNKNOWN: test title"
rx = const.t_META rx = const.t_META
match = re.search(rx, text) match = re.search(rx, text)
assert match is None assert match is None
text="#+UNKNOWN: test title" text = "#+UNKNOWN: test title"
rx = const.t_META_OTHER rx = const.t_META_OTHER
match = re.search(rx, text) match = re.search(rx, text)
assert match is not None assert match is not None
def test_example(): def test_example():
text="#+BEGIN_EXAMPLE" text = "#+BEGIN_EXAMPLE"
rx = const.t_EXAMPLE_BEGIN rx = const.t_EXAMPLE_BEGIN
match = re.search(rx, text) match = re.search(rx, text)
assert match is not None assert match is not None
text="#+BEGIN_EXAMPLE " text = "#+BEGIN_EXAMPLE "
rx = const.t_EXAMPLE_BEGIN rx = const.t_EXAMPLE_BEGIN
match = re.search(rx, text) match = re.search(rx, text)
assert match is not None assert match is not None
def test_source(): def test_source():
# invalid if no language specified # invalid if no language specified
text="#+BEGIN_SRC" text = "#+BEGIN_SRC"
rx = const.t_SRC_BEGIN rx = const.t_SRC_BEGIN
match = re.search(rx, text) match = re.search(rx, text)
assert match is None assert match is None
text="#+BEGIN_SRC " text = "#+BEGIN_SRC "
rx = const.t_SRC_BEGIN rx = const.t_SRC_BEGIN
match = re.search(rx, text) match = re.search(rx, text)
assert match is not None assert match is not None
text="#+BEGIN_SRC sh :results silent" text = "#+BEGIN_SRC sh :results silent"
rx = const.t_SRC_BEGIN rx = const.t_SRC_BEGIN
match = re.search(rx, text) match = re.search(rx, text)
assert match is not None assert match is not None
@ -55,12 +55,24 @@ def test_source():
def test_bullets(): def test_bullets():
# invalid if no language specified # invalid if no language specified
text=" + bullet 1" text = " + bullet 1"
rx = const.t_BULLET_START rx = const.t_BULLET_START
match = re.search(rx, text) match = re.search(rx, text)
assert match is not None assert match is not None
text="+ bullet 1" text = "+ bullet 1"
rx = const.t_BULLET_START rx = const.t_BULLET_START
match = re.search(rx, text) match = re.search(rx, text)
assert match is not None assert match is not None
def test_captions_regex():
    """The caption pattern matches with or without a space after the colon."""
    pattern = const.t_CAPTIONS
    for sample in ("#+CAPTION: Test", "#+CAPTION:Test"):
        assert re.search(pattern, sample) is not None