Improved text token parsing

Oliver Marks 2018-10-21 21:41:37 +01:00
parent 1371a1a8dc
commit 82ec528406
9 changed files with 210 additions and 29 deletions

File 1 of 9

@@ -1,12 +1,14 @@
+import os
 import sys
 from eorg.parser import parse
 from eorg.generate import html


-def tangle(v):
+def tangle(doc):
     print("tangle")
-    print(v)
+    code = getattr(doc, 'code')
+    print(code)


 def recursive(path):
     for root, dirs, filenames in os.walk(path):
@@ -23,11 +25,17 @@ def htmlize(doc):
         print(item)


 def handler(fp, kwargs):
-    if kwargs.t is True:
+    if kwargs.s is True:
         tokenize(doc)
+    if kwargs.t is True:
+        tangle(doc)
     if kwargs.w is True:
         print(html(doc).read())
+    if kwargs.meta:
+        values = {}
+        for item in kwargs.meta:
+            values[item] = getattr(doc, item)
+        print(' | '.join([k + ' - ' + v for k, v in values.items()]))


 if __name__ == "__main__":
     import argparse
@@ -35,8 +43,10 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Process some .org files")
     parser.add_argument("filename")
    parser.add_argument('-r', action='store_true', help='recursive')
-    parser.add_argument('-w', action='store_true', help='html')
-    parser.add_argument('-t', action='store_true', help='html')
+    parser.add_argument('-w', action='store_true', help='Generate html')
+    parser.add_argument('-s', action='store_true', help='Document structure')
+    parser.add_argument('-t', action='store_true', help='Tangle out code')
+    parser.add_argument('-m', '--meta', action='append', help='Show meta data')
     parser.add_argument(
         "--tangle",
         dest="tangle",
@@ -47,12 +57,17 @@ if __name__ == "__main__":
     )

     args = parser.parse_args()
-    print(args.filename)
-    print(args.r)
+    filename = os.path.abspath(args.filename)

     if args.r is True:
+        for filename in recursive(filename):
+            with open(filename, "r") as fp:
+                doc = parse(fp)
+                handler(parse(fp), args)
         sys.exit()

-    with open(args.filename, "r") as fp:
+    with open(filename, "r") as fp:
         doc = parse(fp)
         handler(parse(fp), args)
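Illustrative usage (not part of the commit): each -m/--meta NAME flag becomes a getattr lookup on the parsed document, and the values are joined with ' | '. A minimal sketch, assuming the tests/fixtures/test.org fixture and lowercase attribute names such as title and author:

    from eorg.parser import parse

    with open("tests/fixtures/test.org") as fp:
        doc = parse(fp)

    # mirrors the new kwargs.meta branch in handler(); attribute names are assumed
    requested = ["title", "author"]
    values = {name: getattr(doc, name) for name in requested}
    print(' | '.join(k + ' - ' + str(v) for k, v in values.items()))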

File 2 of 9

@@ -1,3 +1,5 @@
+ESCAPE = ['\n']
+
 METADATA = ['TITLE', 'AUTHOR', 'EMAIL', 'DESCRIPTION', 'KEYWORDS']
 t_META = r"^[#]\+(" + '|'.join(METADATA) + ")\:"
 t_BLANK_LINE = '^\s*$'
@@ -8,6 +10,8 @@ t_EXAMPLE_END = r"^\#\+END_EXAMPLE"
 t_SRC_BEGIN = r"^\#\+BEGIN_SRC\s+"
 t_SRC_END = r"^\#\+END_SRC"
 t_RESULTS_START = r"^\#\+RESULTS:"
+t_CAPTIONS = r"^\#\+CAPTION:"
+t_IMG = r"^\[\[\s]]$"
 t_RESULTS_END = r"^\:..*"
 t_HEADER = r"^\*+"
@@ -17,7 +21,21 @@ TOKENS = {
     "META": (t_META, False, 2, -1, False),
     "COMMENT": (t_COMMENT_BEGIN, t_COMMENT_END, 2, None, False),
     "EXAMPLE": (t_EXAMPLE_BEGIN, t_EXAMPLE_END, 2, None, False),
+    "IMG": (t_IMG, False, 2, None, False),
+    "CAPTION": (t_CAPTIONS, False, 2, None, False),
     "SRC_BEGIN": (t_SRC_BEGIN, t_SRC_END, 2, None, False),
     "RESULTS": (t_SRC_BEGIN, t_SRC_END, 2, None, False),
     "HEADER": (t_HEADER, False, 1, None, True),
 }
+
+
+class Token:
+    __slots__ = ["token", "value"]
+
+    def __init__(self, token, value):
+        self.token = token
+        self.value = value
+
+    def __repr__(self):
+        return f"Token(token={self.token}, value={self.value})"

File 3 of 9

@@ -1,39 +1,61 @@
 from io import StringIO
+from eorg.const import Token, ESCAPE
 from pygments import highlight
 from pygments.lexers import PythonLexer
 from pygments.lexers import get_lexer_by_name
 from pygments.formatters import HtmlFormatter


-def src(code):
+def src(doc, code, cls=''):
     lexer = get_lexer_by_name('lisp')
     return highlight(code, lexer, HtmlFormatter())


+def img(doc, item, cls=''):
+    caption = doc.previous('CAPTION')
+    text = ''
+    if caption:
+        text = f'<p class="center-align">{caption.value}</p>'
+    return f'<img{cls} style="margin:auto;" src="{item[0]}" alt="{item[1]}" />{text}'
+
+
+def parse_text_html(doc, tokens, cls=''):
+    if isinstance(tokens, list):
+        for token in tokens:
+            return handle_token(doc, token)
+    return f'<p{cls}>{tokens}</p>'
+
+
 builddoc ={
     "HEADER1": ("h2", None),
     "HEADER2": ("h3", None),
     "HEADER3": ("h4", None),
     # "BREAK": "br",
-    "TEXT": ("p", "flow-text"),
+    "IMG": (img, 'materialboxed center-align responsive-img'),
+    "B": ("b", None),
+    "U": ("u", None),
+    "i": ("i", None),
+    "TEXT": (parse_text_html, "flow-text"),
     "SRC_BEGIN": (src, None),
     "EXAMPLE": ('blockquote', None),
 }


-def html(doc):
+def handle_token(doc, item):
     response = StringIO()
-    for item in doc:
     match = builddoc.get(item.token)
     if not match:
-        continue
+        return ''
     tag, cls = match
     if cls:
         cls = f' class="{cls}"'
     else:
         cls = ''
     if callable(tag):
-        response.write(tag(item.value))
-        continue
+        return tag(doc, item.value, cls)
     else:
-        response.write('<%s%s>%s</%s>\n' % (tag, cls, item.value, tag))
+        return '<%s%s>%s</%s>\n' % (tag, cls, item.value, tag)
+
+
+def html(doc):
+    response = StringIO()
+    for item in doc:
+        response.write(handle_token(doc, item))
     response.seek(0)
     return response
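Illustrative usage (not part of the commit): html() still returns a StringIO, but each token is now rendered by handle_token(), which can dispatch to callables such as img and parse_text_html. A minimal sketch, assuming the tests/fixtures/test.org fixture:

    from eorg.parser import parse
    from eorg.generate import html

    with open("tests/fixtures/test.org") as fp:
        doc = parse(fp)

    print(html(doc).read())   # the generated markup as one string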

File 4 of 9

@@ -1,5 +1,5 @@
 import re
-from eorg.const import TOKENS, METADATA
+from eorg.const import TOKENS, METADATA, ESCAPE


 class Token:
@@ -14,6 +14,7 @@ class Token:

 class Document:
+    pos = 0
     doc = []
     index = {}
@@ -26,7 +27,7 @@ class Document:
         if not idx:
             if default is not None:
                 return default
-            raise ValueError(f"Attribute of {name} does not exist in document")
+            raise AttributeError(f"Attribute of {name} does not exist in document")
         if len(idx) == 1:
             return self.doc[idx[0]].value
         return [self.doc[v].value for v in idx]
@@ -40,8 +41,23 @@ class Document:
         self.doc[-1].value += value

     def __iter__(self):
+        self.pos = 0
         for item in self.doc:
             yield item
+            self.pos += 1
+
+    def previous(self, match):
+        if self.pos is 0:
+            return None
+        if self.doc[self.pos-1].token != match:
+            return None
+        return self.doc[self.pos-1]
+
+    def filter(self, value):
+        """Only return types that are of intrest like source blocks"""
+        for item in self.doc:
+            if item.token == value:
+                yield item

     def body(self):
         for item in self.doc:
@@ -49,6 +65,16 @@ class Document:
                 continue
             yield item

+    def images(self):
+        for item in self.__iter__():
+            if item.token == 'IMG':
+                yield item.value[0]
+            if item.token == 'TEXT':
+                if isinstance(item.value, list):
+                    for token in item.value:
+                        if token.token == 'IMG':
+                            yield token
+
     def __len__(self):
         return len(self.doc)
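Illustrative usage of the new Document helpers (not part of the commit): filter() yields only tokens of one type, images() also walks inline TEXT token lists for IMG values, and previous() is what img() in the generator uses to find a preceding CAPTION. A sketch, assuming the same fixture and the token names defined in the TOKENS table:

    from eorg.parser import parse

    with open("tests/fixtures/test.org") as fp:
        doc = parse(fp)

    for block in doc.filter('SRC_BEGIN'):   # only source blocks
        print(block.value)

    for image in doc.images():              # image paths and inline IMG tokens
        print(image)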
@@ -85,6 +111,68 @@ def parseline(text):
     return False, Token(token="TEXT", value=text + " ")


+def parse_text(txt):
+    char = True
+    tokens = []
+
+    def img(char, step):
+        if char != '[':
+            return char
+        char = next(step, None)
+        if char != '[':
+            return char
+        char = next(step, None)
+        path = ''
+        while char not in [']'] + ESCAPE:
+            path += char
+            char = next(step, None)
+        char = next(step, None)
+        alt = ''
+        if char == '[':
+            char = next(step, None)
+            while char not in [']'] + ESCAPE:
+                alt += char
+                char = next(step, None)
+            char = next(step, None)
+        tokens.append(Token('IMG', [path, alt]))
+        return ''
+
+    def emphasis(char, step, end='*', tag='b'):
+        if not char or char != end:
+            return char
+        char = next(step, None)
+        r = ''
+        while char and char not in [end] + ESCAPE:
+            r += char
+            char = next(step, None)
+        tokens.append(Token('b', r))
+        return ''
+
+    step = iter(txt)
+    while char is not None:
+        char = next(step, None)
+        char = emphasis(char, step, '*', 'b')
+        char = emphasis(char, step, '/', 'i')
+        char = emphasis(char, step, '_', 'u')
+        char = emphasis(char, step, '=', 'v')
+        char = emphasis(char, step, '~', 'pre')
+        char = img(char, step)
+        if not char:
+            continue
+        if len(tokens) == 0:
+            tokens.append(Token('TEXT', char))
+            continue
+        if tokens[-1].token != 'TEXT':
+            tokens.append(Token('TEXT', char))
+            continue
+        tokens[-1].value += char
+    return tokens
+
+
 def parse(stream):
     doc = Document()
     block = False
@@ -103,4 +191,9 @@ def parse(stream):
             doc.update(result[1].value)
             continue
         doc.append(result[1])
+
+    for item in doc.filter('TEXT'):
+        #print('@@@@@@@@@@@@@@@@@')
+        #print(item.value)
+        item.value = parse_text(item.value)
     return doc
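Illustrative usage (not part of the commit): parse_text() turns a raw line of org text into a list of inline tokens, roughly the behaviour the new tests below exercise:

    from eorg.parser import parse_text

    for tok in parse_text("parse image [[../../test.jpg][test]] and *bold text*"):
        print(tok.token, tok.value)

    # TEXT parse image
    # IMG ['../../test.jpg', 'test']
    # TEXT  and
    # b bold text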

File 5 of 9

@@ -1,2 +1,2 @@
-__version__ = 0.1
+__version__ = 0.2

File 6 of 9

@@ -25,10 +25,10 @@ Simple raw html generation
 head -n 5 examples/html-plain/example.py
 #+END_SRC

 ** Enaml web templating language
 Written mainly to try out enaml-web
 #+BEGIN_SRC sh :results output drawer
 head -n 5 examples/html-enaml/example.py
 #+END_SRC

File 7 of 9

@@ -0,0 +1,2 @@
+*bold*, /italic/, _underlined_, =verbatim= and ~code~, and, if you must, +strike-through+

File 8 of 9

@@ -11,5 +11,5 @@ def test_fetch_attribute():
 def test_fetch_non_existant_attribute():
     with open(os.path.abspath("./tests/fixtures/test.org"), "r") as fp:
         doc = parse(fp)
-    with pytest.raises(ValueError):
+    with pytest.raises(AttributeError):
         doc.fake

File 9 of 9: tests/test_html.py (new file, 31 lines)

@@ -0,0 +1,31 @@
+import os
+import pytest
+from eorg.parser import Token
+from eorg.parser import parse
+from eorg.parser import parse_text
+
+
+def test_emphasis():
+    text = "parse emphasis *bold text* _underlined text_ /italic text/ normal text"
+    expected = [Token(token='TEXT', value='parse emphasis '), Token(token='b', value='bold text'), Token(token='TEXT', value=' '), Token(token='b', value='underlined text'), Token(token='TEXT', value=' '), Token(token='b', value='italic text'), Token('TEXT', ' normal text')]
+    result = parse_text(text)
+    assert expected[0].value == result[0].value
+    assert expected[1].value == result[1].value
+    assert expected[2].value == result[2].value
+    assert expected[3].value == result[3].value
+    assert expected[4].value == result[4].value
+    assert expected[5].value == result[5].value
+    assert expected[6].value == result[6].value
+
+
+def test_image():
+    text = "parse image [[../../test.jpg][test]] after image"
+    expected = [
+        Token("TEXT", "parse image "),
+        Token("IMG", ["../../test.jpg", "test"]),
+        Token("TEXT", " after image"),
+    ]
+    result = parse_text(text)
+    assert result[0].value == expected[0].value
+    assert result[1].value == expected[1].value
+    assert result[2].value == expected[2].value