From 17994c2eb7156758680a076926dc7c202eabb85e Mon Sep 17 00:00:00 2001 From: Oly Date: Tue, 27 Nov 2018 14:19:25 +0000 Subject: [PATCH] Working on improved image parsing for captions --- eorg/helper.py | 34 ++++++++++++++++++++++++++++++++++ eorg/parser.py | 32 ++++++++++++++++++++++++++++++-- tests/test_document_parsing.py | 24 +++++++++++++++++++++--- tests/test_regex.py | 34 +++++++++++++++++++++++----------- 4 files changed, 108 insertions(+), 16 deletions(-) create mode 100644 eorg/helper.py diff --git a/eorg/helper.py b/eorg/helper.py new file mode 100644 index 0000000..2a70710 --- /dev/null +++ b/eorg/helper.py @@ -0,0 +1,34 @@ +from eorg import tokens +from eorg.tokens import Token +from eorg.const import ESCAPE, image_extensions + + +def parse_img_or_link(char, step): + if char != "[": + return char, None + + char = next(step, None) + + if char != "[": + return char, None + + char = next(step, None) + + path = "" + while char not in ["]"] + ESCAPE: + path += char + char = next(step, None) + char = next(step, None) + + alt = "" + if char == "[": + char = next(step, None) + while char not in ["]"] + ESCAPE: + alt += char + char = next(step, None) + char = next(step, None) + + if path.endswith(image_extensions): + return "", Token(tokens.IMAGE, [path, alt]) + + return "", Token(tokens.LINK, [path, alt]) diff --git a/eorg/parser.py b/eorg/parser.py index 0e9f15f..741a60d 100644 --- a/eorg/parser.py +++ b/eorg/parser.py @@ -2,6 +2,7 @@ import re from eorg import tokens from eorg.tokens import Token from eorg.const import TOKENS, METADATA, ESCAPE, image_extensions +from eorg.helper import parse_img_or_link class Document: @@ -18,16 +19,20 @@ class Document: if not idx: if default is not None: return default + raise AttributeError( f"Attribute of {name} does not exist in document" ) + if len(idx) == 1: return self.doc[idx[0]].value + return [self.doc[v].value for v in idx] def token(self): if self.doc: return self.doc[-1].token + return "" def update(self, value): @@ -37,13 +42,16 @@ class Document: self.pos = 0 for item in self.doc: yield item + self.pos += 1 def previous(self, match): if self.pos is 0: return None + if self.doc[self.pos - 1].token != match: return None + return self.doc[self.pos - 1] def filter(self, value): @@ -56,12 +64,14 @@ class Document: for item in self.doc: if item.token in METADATA: continue + yield item def images(self): for item in self.__iter__(): if item.token == tokens.IMAGE: yield item + if isinstance(item.value, list): for token in item.value: if token.token == tokens.IMAGE: @@ -89,6 +99,7 @@ def parsebody(text, rx): match = re.search(rx, text) if match: return False, None + return rx, text + "\n" @@ -98,23 +109,29 @@ def parseline(text): match = re.search(rx, text) if not match: continue - value = text[match.end() :] + + value = text[match.end():] level = len(match.group(0)) if count is True: attrs = {"depth": level} if key == tokens.META: return (block, Token(token=match.group(0)[s:e], value=value)) + if key == tokens.SOURCE: return block, Token(token=key, attrs=parse_attrs(value)) + if key == tokens.TABLE: return block, Token(token=key, value=text + "\n") + if key == tokens.BULLET: return block, Token(token=key, value=text + "\n") + return block, Token(token=key, value=value, attrs=attrs) text = text.strip() if text == "": return False, Token(token=tokens.BLANK, value=text) + return False, Token(token=tokens.LIST, value=text + " ") @@ -125,10 +142,12 @@ def parse_text(txt): def img(char, step): if char != "[": return char + char = next(step, None) if char != "[": return char + char = next(step, None) path = "" @@ -155,6 +174,7 @@ def parse_text(txt): def emphasis(char, step, end, tag): if not char or char != end: return char + char = next(step, None) r = "" while char and char not in [end] + ESCAPE: @@ -171,15 +191,21 @@ def parse_text(txt): char = emphasis(char, step, "_", tokens.UNDERLINED) char = emphasis(char, step, "=", tokens.VERBATIM) char = emphasis(char, step, "~", "PRE") - char = img(char, step) + #char = img(char, step) + char, token = parse_img_or_link(char, step) + if token: + tokenlist.append(token) if not char: continue + if len(tokenlist) == 0: tokenlist.append(Token(tokens.TEXT, char)) continue + if tokenlist[-1].token != tokens.TEXT: tokenlist.append(Token(tokens.TEXT, char)) continue + tokenlist[-1].value += char return tokenlist @@ -194,11 +220,13 @@ def parse(stream): if block: doc.update(token) continue + block, token = parseline(line) if token: if doc.token() == tokens.LIST and token.token == tokens.LIST: doc.update(token.value) continue + doc.append(token) for item in doc.filter(tokens.LIST): diff --git a/tests/test_document_parsing.py b/tests/test_document_parsing.py index f8695e6..5f243b7 100644 --- a/tests/test_document_parsing.py +++ b/tests/test_document_parsing.py @@ -49,12 +49,30 @@ def test_image(): text = "[[../../../images/opengl/point-sprite-shader.png]]" expected = [ - Token(tokens.IMAGE, ["../../../images/opengl/point-sprite-shader.png", ""]), + Token( + tokens.IMAGE, + ["../../../images/opengl/point-sprite-shader.png", ""], + ) ] - result = parse_text(text) + result = parse(text).doc assert result[0].value == expected[0].value +def test_image_with_caption(): + text = StringIO("""#+CAPTION: Test Image +text [[../../test.jpg][test]]""") + expected = [ + Token(tokens.CAPTION, " Test Image"), + Token(tokens.LIST, [Token(tokens.IMAGE, ["../../test.jpg", "test"])]), + ] + result = parse(text).doc + assert len(result) == 2 + assert result[0].token == expected[0].token + assert result[0].value == expected[0].value + assert result[1].token == expected[1].token + assert result[1].value == expected[1].value + + def test_link(): text = "parse link [[../../test.html][test]] after link" expected = [ @@ -108,7 +126,7 @@ head -n 5 examples/html-plain/example.py result = parse(text).doc assert result[0].token == tokens.BLANK assert result[0].value == expected[0].value - assert result[1].attrs.get('language') == 'sh' + assert result[1].attrs.get("language") == "sh" assert result[1].value == expected[1].value diff --git a/tests/test_regex.py b/tests/test_regex.py index 33ce540..f80c4a1 100644 --- a/tests/test_regex.py +++ b/tests/test_regex.py @@ -7,47 +7,47 @@ from eorg.generate import html def test_meta_headers(): - text="#+TITLE: test title" + text = "#+TITLE: test title" rx = const.t_META match = re.search(rx, text) assert match is not None - text="#+UNKNOWN: test title" + text = "#+UNKNOWN: test title" rx = const.t_META match = re.search(rx, text) assert match is None - text="#+UNKNOWN: test title" + text = "#+UNKNOWN: test title" rx = const.t_META_OTHER match = re.search(rx, text) assert match is not None + def test_example(): - text="#+BEGIN_EXAMPLE" + text = "#+BEGIN_EXAMPLE" rx = const.t_EXAMPLE_BEGIN match = re.search(rx, text) assert match is not None - text="#+BEGIN_EXAMPLE " + text = "#+BEGIN_EXAMPLE " rx = const.t_EXAMPLE_BEGIN match = re.search(rx, text) assert match is not None - def test_source(): # invalid if no language specified - text="#+BEGIN_SRC" + text = "#+BEGIN_SRC" rx = const.t_SRC_BEGIN match = re.search(rx, text) assert match is None - text="#+BEGIN_SRC " + text = "#+BEGIN_SRC " rx = const.t_SRC_BEGIN match = re.search(rx, text) assert match is not None - text="#+BEGIN_SRC sh :results silent" + text = "#+BEGIN_SRC sh :results silent" rx = const.t_SRC_BEGIN match = re.search(rx, text) assert match is not None @@ -55,12 +55,24 @@ def test_source(): def test_bullets(): # invalid if no language specified - text=" + bullet 1" + text = " + bullet 1" rx = const.t_BULLET_START match = re.search(rx, text) assert match is not None - text="+ bullet 1" + text = "+ bullet 1" rx = const.t_BULLET_START match = re.search(rx, text) assert match is not None + + +def test_captions_regex(): + text = "#+CAPTION: Test" + rx = const.t_CAPTIONS + match = re.search(rx, text) + assert match is not None + + text = "#+CAPTION:Test" + rx = const.t_CAPTIONS + match = re.search(rx, text) + assert match is not None