Working on improved image parsing for captions

2018-11-27 14:19:25 +00:00 · 2018-11-27 14:19:25 +00:00 · 17994c2eb7
parent ac899b8ada
commit 17994c2eb7
4 changed files with 108 additions and 16 deletions
--- a/eorg/helper.py
+++ b/eorg/helper.py
@ -0,0 +1,34 @@
 from eorg import tokens
 from eorg.tokens import Token
 from eorg.const import ESCAPE, image_extensions
 def parse_img_or_link(char, step):
    """Parse an org-style ``[[path]]`` or ``[[path][alt]]`` link at *char*.

    Args:
        char: The current character; parsing only starts when it is ``"["``.
        step: Iterator yielding the characters that follow *char*.

    Returns:
        A ``(text, token)`` tuple.  When no link starts here, *token* is
        ``None`` and *text* is the text the caller should keep.  When a link
        was read, *text* is ``""`` and *token* is an ``IMAGE`` token if the
        path ends with one of ``image_extensions``, otherwise a ``LINK``
        token; in both cases the token value is ``[path, alt]``.
    """
    if char != "[":
        return char, None
    char = next(step, None)
    if char != "[":
        # A lone "[" is ordinary text: hand it back together with the
        # character consumed while looking ahead, so neither is lost.
        return "[" + (char or ""), None
    path = _read_bracket_text(step)
    alt = ""
    # Consume the character after the path's terminator: either the final
    # "]" or the "[" that opens the optional description.
    if next(step, None) == "[":
        alt = _read_bracket_text(step)
        next(step, None)  # consume the final "]"
    kind = tokens.IMAGE if path.endswith(image_extensions) else tokens.LINK
    return "", Token(kind, [path, alt])
 def _read_bracket_text(step):
    """Collect characters from *step* up to a ``]`` or an ESCAPE character.

    The terminating character is consumed but not returned.  Reading also
    stops when *step* is exhausted, so an unterminated link yields the text
    gathered so far instead of raising ``TypeError`` on ``None``.
    """
    terminators = ["]"] + ESCAPE
    collected = []
    char = next(step, None)
    while char is not None and char not in terminators:
        collected.append(char)
        char = next(step, None)
    return "".join(collected)
--- a/eorg/parser.py
+++ b/eorg/parser.py
@ -2,6 +2,7 @@ import re
 from eorg import tokens
 from eorg.tokens import Token
 from eorg.const import TOKENS, METADATA, ESCAPE, image_extensions
 from eorg.helper import parse_img_or_link
 class Document:
@ -18,16 +19,20 @@ class Document:
        if not idx:
            if default is not None:
                return default
            raise AttributeError(
                f"Attribute of {name} does not exist in document"
            )
        if len(idx) == 1:
            return self.doc[idx[0]].value
        return [self.doc[v].value for v in idx]
    def token(self):
        """Return the token type of the last item in ``self.doc``.

        Returns an empty string when the document holds no items yet.
        """
        return self.doc[-1].token if self.doc else ""
    def update(self, value):
@ -37,13 +42,16 @@ class Document:
        self.pos = 0
        for item in self.doc:
            yield item
            self.pos += 1
    def previous(self, match):
        """Return the item just before the current position if it matches.

        Args:
            match: Token type the preceding item must have.

        Returns:
            ``self.doc[self.pos - 1]`` when ``self.pos`` is non-zero and that
            item's ``token`` equals *match*; otherwise ``None``.
        """
        # Compare by value: ``is 0`` relies on interpreter integer caching
        # and triggers a SyntaxWarning on Python 3.8+.
        if self.pos == 0:
            return None
        candidate = self.doc[self.pos - 1]
        return candidate if candidate.token == match else None
    def filter(self, value):
@ -56,12 +64,14 @@ class Document:
        for item in self.doc:
            if item.token in METADATA:
                continue
            yield item
    def images(self):
        for item in self.__iter__():
            if item.token == tokens.IMAGE:
                yield item
            if isinstance(item.value, list):
                for token in item.value:
                    if token.token == tokens.IMAGE:
@ -89,6 +99,7 @@ def parsebody(text, rx):
    match = re.search(rx, text)
    if match:
        return False, None
    return rx, text + "\n"
@ -98,23 +109,29 @@ def parseline(text):
        match = re.search(rx, text)
        if not match:
            continue
-        value = text[match.end() :]
+
        value = text[match.end():]
        level = len(match.group(0))
        if count is True:
            attrs = {"depth": level}
        if key == tokens.META:
            return (block, Token(token=match.group(0)[s:e], value=value))
        if key == tokens.SOURCE:
            return block, Token(token=key, attrs=parse_attrs(value))
        if key == tokens.TABLE:
            return block, Token(token=key, value=text + "\n")
        if key == tokens.BULLET:
            return block, Token(token=key, value=text + "\n")
        return block, Token(token=key, value=value, attrs=attrs)
    text = text.strip()
    if text == "":
        return False, Token(token=tokens.BLANK, value=text)
    return False, Token(token=tokens.LIST, value=text + " ")
@ -125,10 +142,12 @@ def parse_text(txt):
    def img(char, step):
        if char != "[":
            return char
        char = next(step, None)
        if char != "[":
            return char
        char = next(step, None)
        path = ""
@ -155,6 +174,7 @@ def parse_text(txt):
    def emphasis(char, step, end, tag):
        if not char or char != end:
            return char
        char = next(step, None)
        r = ""
        while char and char not in [end] + ESCAPE:
@ -171,15 +191,21 @@ def parse_text(txt):
        char = emphasis(char, step, "_", tokens.UNDERLINED)
        char = emphasis(char, step, "=", tokens.VERBATIM)
        char = emphasis(char, step, "~", "PRE")
-        char = img(char, step)
+        #char = img(char, step)
        char, token = parse_img_or_link(char, step)
        if token:
            tokenlist.append(token)
        if not char:
            continue
        if len(tokenlist) == 0:
            tokenlist.append(Token(tokens.TEXT, char))
            continue
        if tokenlist[-1].token != tokens.TEXT:
            tokenlist.append(Token(tokens.TEXT, char))
            continue
        tokenlist[-1].value += char
    return tokenlist
@ -194,11 +220,13 @@ def parse(stream):
            if block:
                doc.update(token)
            continue
        block, token = parseline(line)
        if token:
            if doc.token() == tokens.LIST and token.token == tokens.LIST:
                doc.update(token.value)
                continue
            doc.append(token)
    for item in doc.filter(tokens.LIST):
--- a/tests/test_document_parsing.py
+++ b/tests/test_document_parsing.py
@ -49,12 +49,30 @@ def test_image():
    text = "[[../../../images/opengl/point-sprite-shader.png]]"
    expected = [
-        Token(tokens.IMAGE, ["../../../images/opengl/point-sprite-shader.png", ""]),
+        Token(
            tokens.IMAGE,
            ["../../../images/opengl/point-sprite-shader.png", ""],
        )
    ]
-    result = parse_text(text)
+    result = parse(text).doc
    assert result[0].value == expected[0].value
 def test_image_with_caption():
    """Parse a ``#+CAPTION:`` line followed by a text line with an image link.

    The parsed document is expected to contain exactly two items, compared
    by token type and value against a CAPTION token and a LIST token that
    wraps an IMAGE token carrying ``[path, alt]``.
    """
    # Two-line input stream: caption metadata, then text with [[path][alt]].
    text = StringIO("""#+CAPTION: Test Image
 text  [[../../test.jpg][test]]""")
    expected = [
        Token(tokens.CAPTION, " Test Image"),
        Token(tokens.LIST, [Token(tokens.IMAGE, ["../../test.jpg", "test"])]),
    ]
    result = parse(text).doc
    assert len(result) == 2
    # Only ``token`` and ``value`` are compared for each item.
    assert result[0].token == expected[0].token
    assert result[0].value == expected[0].value
    assert result[1].token == expected[1].token
    assert result[1].value == expected[1].value
 def test_link():
    text = "parse link [[../../test.html][test]] after link"
    expected = [
@ -108,7 +126,7 @@ head -n 5 examples/html-plain/example.py
    result = parse(text).doc
    assert result[0].token == tokens.BLANK
    assert result[0].value == expected[0].value
-    assert result[1].attrs.get('language') == 'sh'
+    assert result[1].attrs.get("language") == "sh"
    assert result[1].value == expected[1].value
--- a/tests/test_regex.py
+++ b/tests/test_regex.py
@ -7,47 +7,47 @@ from eorg.generate import html
 def test_meta_headers():
-    text="#+TITLE: test title"
+    text = "#+TITLE: test title"
    rx = const.t_META
    match = re.search(rx, text)
    assert match is not None
-    text="#+UNKNOWN: test title"
+    text = "#+UNKNOWN: test title"
    rx = const.t_META
    match = re.search(rx, text)
    assert match is None
-    text="#+UNKNOWN: test title"
+    text = "#+UNKNOWN: test title"
    rx = const.t_META_OTHER
    match = re.search(rx, text)
    assert match is not None
 def test_example():
-    text="#+BEGIN_EXAMPLE"
+    text = "#+BEGIN_EXAMPLE"
    rx = const.t_EXAMPLE_BEGIN
    match = re.search(rx, text)
    assert match is not None
-    text="#+BEGIN_EXAMPLE "
+    text = "#+BEGIN_EXAMPLE "
    rx = const.t_EXAMPLE_BEGIN
    match = re.search(rx, text)
    assert match is not None
 def test_source():
    # invalid if no language specified
-    text="#+BEGIN_SRC"
+    text = "#+BEGIN_SRC"
    rx = const.t_SRC_BEGIN
    match = re.search(rx, text)
    assert match is None
-    text="#+BEGIN_SRC "
+    text = "#+BEGIN_SRC "
    rx = const.t_SRC_BEGIN
    match = re.search(rx, text)
    assert match is not None
-    text="#+BEGIN_SRC sh :results silent"
+    text = "#+BEGIN_SRC sh :results silent"
    rx = const.t_SRC_BEGIN
    match = re.search(rx, text)
    assert match is not None
@ -55,12 +55,24 @@ def test_source():
 def test_bullets():
    # invalid if no language specified
-    text=" + bullet 1"
+    text = " + bullet 1"
    rx = const.t_BULLET_START
    match = re.search(rx, text)
    assert match is not None
-    text="+ bullet 1"
+    text = "+ bullet 1"
    rx = const.t_BULLET_START
    match = re.search(rx, text)
    assert match is not None
 def test_captions_regex():
    """Check that ``const.t_CAPTIONS`` matches captions with and without a
    space after the colon."""
    for sample in ("#+CAPTION: Test", "#+CAPTION:Test"):
        assert re.search(const.t_CAPTIONS, sample) is not None