Improved text token parsing

This commit is contained in:
Oliver Marks 2018-10-21 21:41:37 +01:00
parent 1371a1a8dc
commit 82ec528406
9 changed files with 210 additions and 29 deletions

View File

@ -1,12 +1,14 @@
import os
import sys
from eorg.parser import parse
from eorg.generate import html
def tangle(v):
def tangle(doc):
print("tangle")
print(v)
code = getattr(doc, 'code')
print(code)
def recursive(path):
for root, dirs, filenames in os.walk(path):
@ -23,11 +25,17 @@ def htmlize(doc):
print(item)
def handler(fp, kwargs):
if kwargs.t is True:
if kwargs.s is True:
tokenize(doc)
if kwargs.t is True:
tangle(doc)
if kwargs.w is True:
print(html(doc).read())
if kwargs.meta:
values = {}
for item in kwargs.meta:
values[item] = getattr(doc, item)
print(' | '.join([k + ' - ' + v for k,v in values.items()]))
if __name__ == "__main__":
import argparse
@ -35,8 +43,10 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Process some .org files")
parser.add_argument("filename")
parser.add_argument('-r', action='store_true', help='recursive')
parser.add_argument('-w', action='store_true', help='html')
parser.add_argument('-t', action='store_true', help='html')
parser.add_argument('-w', action='store_true', help='Generate html')
parser.add_argument('-s', action='store_true', help='Document structure')
parser.add_argument('-t', action='store_true', help='Tangle out code')
parser.add_argument('-m', '--meta', action='append', help='Show meta data')
parser.add_argument(
"--tangle",
dest="tangle",
@ -47,12 +57,17 @@ if __name__ == "__main__":
)
args = parser.parse_args()
print(args.filename)
print(args.r)
filename = os.path.abspath(args.filename)
if args.r is True:
for filename in recursive(filename):
with open(filename, "r") as fp:
doc = parse(fp)
handler(parse(fp), args)
sys.exit()
with open(args.filename, "r") as fp:
with open(filename, "r") as fp:
doc = parse(fp)
handler(parse(fp), args)

View File

@ -1,3 +1,5 @@
ESCAPE = ['\n']
METADATA = ['TITLE', 'AUTHOR', 'EMAIL', 'DESCRIPTION', 'KEYWORDS']
t_META = r"^[#]\+(" + '|'.join(METADATA) +")\:"
t_BLANK_LINE = '^\s*$'
@ -8,6 +10,8 @@ t_EXAMPLE_END = r"^\#\+END_EXAMPLE"
t_SRC_BEGIN = r"^\#\+BEGIN_SRC\s+"
t_SRC_END = r"^\#\+END_SRC"
t_RESULTS_START = r"^\#\+RESULTS:"
t_CAPTIONS = r"^\#\+CAPTION:"
t_IMG = r"^\[\[\s]]$"
t_RESULTS_END = r"^\:..*"
t_HEADER = r"^\*+"
@ -17,7 +21,21 @@ TOKENS = {
"META": (t_META, False, 2, -1, False),
"COMMENT": (t_COMMENT_BEGIN, t_COMMENT_END, 2, None, False),
"EXAMPLE": (t_EXAMPLE_BEGIN, t_EXAMPLE_END, 2, None, False),
"IMG": (t_IMG, False, 2, None, False),
"CAPTION": (t_CAPTIONS, False, 2, None, False),
"SRC_BEGIN": (t_SRC_BEGIN, t_SRC_END, 2, None, False),
"RESULTS": (t_SRC_BEGIN, t_SRC_END, 2, None, False),
"HEADER": (t_HEADER, False, 1, None, True),
}
class Token:
    """Minimal token record: a token type name paired with its payload."""

    # Slots keep per-token memory small; documents can hold many tokens.
    __slots__ = ["token", "value"]

    def __init__(self, token, value):
        self.token = token
        self.value = value

    def __repr__(self):
        return "Token(token={}, value={})".format(self.token, self.value)

View File

@ -1,39 +1,61 @@
from io import StringIO
from eorg.const import Token, ESCAPE
from pygments import highlight
from pygments.lexers import PythonLexer
from pygments.lexers import get_lexer_by_name
from pygments.formatters import HtmlFormatter
def src(code):
def src(doc, code, cls=''):
    # Render a source block's code to highlighted HTML via Pygments.
    # NOTE(review): the lexer is hard-coded to 'lisp' and both `doc` and
    # `cls` are ignored here — presumably the SRC_BEGIN token carries the
    # block's language, which should be forwarded to get_lexer_by_name;
    # confirm against the parser before changing.
    lexer = get_lexer_by_name('lisp')
    return highlight(code, lexer, HtmlFormatter())
def img(doc, item, cls=''):
    """Render an IMG token as an ``<img>`` tag.

    ``item`` is a ``[path, alt]`` pair; if the token immediately before the
    document cursor is a CAPTION, it is appended as a centred paragraph.
    """
    preceding = doc.previous('CAPTION')
    caption_html = ''
    if preceding:
        caption_html = f'<p class="center-align">{preceding.value}</p>'
    return f'<img{cls} style="margin:auto;" src="{item[0]}" alt="{item[1]}" />{caption_html}'
def parse_text_html(doc, tokens, cls=''):
    """Render a TEXT token to HTML.

    ``tokens`` is either a plain string (wrapped in a ``<p>``) or a list of
    sub-tokens produced by ``parse_text`` (each rendered via ``handle_token``).
    """
    if isinstance(tokens, list):
        # Bug fix: the previous code returned inside the loop, so only the
        # first sub-token of a text run was ever rendered. Render them all.
        return ''.join(handle_token(doc, token) for token in tokens)
    return f'<p{cls}>{tokens}</p>'
builddoc ={
"HEADER1": ("h2", None),
"HEADER2": ("h3", None),
"HEADER3": ("h4", None),
# "BREAK": "br",
"TEXT": ("p", "flow-text"),
"IMG": (img, 'materialboxed center-align responsive-img'),
"B": ("b", None),
"U": ("u", None),
"i": ("i", None),
"TEXT": (parse_text_html, "flow-text"),
"SRC_BEGIN": (src, None),
"EXAMPLE": ('blockquote', None),
}
def html(doc):
def handle_token(doc, item):
response = StringIO()
for item in doc:
match = builddoc.get(item.token)
if not match:
continue
return ''
tag, cls = match
if cls:
cls = f' class="{cls}"'
else:
cls = ''
if callable(tag):
response.write(tag(item.value))
continue
return tag(doc, item.value, cls)
else:
response.write('<%s%s>%s</%s>\n' % (tag, cls, item.value, tag))
return '<%s%s>%s</%s>\n' % (tag, cls, item.value, tag)
def html(doc):
    """Render every token of *doc* to HTML and return a StringIO rewound to 0."""
    out = StringIO()
    for token in doc:
        out.write(handle_token(doc, token))
    out.seek(0)
    return out

View File

@ -1,5 +1,5 @@
import re
from eorg.const import TOKENS, METADATA
from eorg.const import TOKENS, METADATA, ESCAPE
class Token:
@ -14,6 +14,7 @@ class Token:
class Document:
pos = 0
doc = []
index = {}
@ -26,7 +27,7 @@ class Document:
if not idx:
if default is not None:
return default
raise ValueError(f"Attribute of {name} does not exist in document")
raise AttributeError(f"Attribute of {name} does not exist in document")
if len(idx) == 1:
return self.doc[idx[0]].value
return [self.doc[v].value for v in idx]
@ -40,8 +41,23 @@ class Document:
self.doc[-1].value += value
def __iter__(self):
self.pos = 0
for item in self.doc:
yield item
self.pos += 1
def previous(self, match):
    """Return the token just before the iteration cursor if its type is *match*.

    Returns ``None`` when the cursor is at the start of the document or when
    the preceding token's type differs from *match*.
    """
    # Bug fix: was `self.pos is 0` — identity comparison on an int only
    # works by accident of CPython's small-int caching; use equality.
    if self.pos == 0:
        return None
    prev = self.doc[self.pos - 1]
    if prev.token != match:
        return None
    return prev
def filter(self, value):
    """Yield only tokens whose type equals *value* — e.g. source blocks of interest."""
    return (item for item in self.doc if item.token == value)
def body(self):
for item in self.doc:
@ -49,6 +65,16 @@ class Document:
continue
yield item
def images(self):
    """Yield image references found in the document.

    NOTE(review): top-level IMG tokens yield just the path (``value[0]``),
    while IMG tokens nested inside a TEXT run yield the whole token object —
    presumably intentional, but confirm with callers.
    """
    for item in self.__iter__():
        if item.token == 'IMG':
            yield item.value[0]
        elif item.token == 'TEXT' and isinstance(item.value, list):
            for sub in item.value:
                if sub.token == 'IMG':
                    yield sub
def __len__(self):
return len(self.doc)
@ -85,6 +111,68 @@ def parseline(text):
return False, Token(token="TEXT", value=text + " ")
def parse_text(txt):
    """Tokenize inline org-mode markup in *txt*.

    Recognizes emphasis pairs (*bold*, /italic/, _underline_, =verbatim=,
    ~code~) and image links (``[[path][alt]]``); everything else accumulates
    into TEXT tokens. Returns a list of Token objects.
    """
    char = True
    tokens = []

    def img(char, step):
        # Consume a `[[path][alt]]` link starting at *char*; on a match,
        # append an IMG token and return '' so the outer loop skips it.
        if char != '[':
            return char
        char = next(step, None)
        if char != '[':
            return char
        char = next(step, None)
        path = ''
        # Guard against `char is None` (unterminated link at end of input),
        # which previously raised TypeError from `None not in [...]`.
        while char is not None and char not in [']'] + ESCAPE:
            path += char
            char = next(step, None)
        char = next(step, None)
        alt = ''
        if char == '[':
            char = next(step, None)
            while char is not None and char not in [']'] + ESCAPE:
                alt += char
                char = next(step, None)
            char = next(step, None)
        tokens.append(Token('IMG', [path, alt]))
        return ''

    def emphasis(char, step, end='*', tag='b'):
        # Consume `<end>text<end>`; on a match, append a token of type *tag*
        # and return '' so the outer loop skips it.
        if not char or char != end:
            return char
        char = next(step, None)
        r = ''
        while char and char not in [end] + ESCAPE:
            r += char
            char = next(step, None)
        # Bug fix: the tag parameter was ignored — every emphasis run was
        # appended as Token('b', ...), so italic/underline/verbatim/code
        # all rendered as bold.
        tokens.append(Token(tag, r))
        return ''

    step = iter(txt)
    while char is not None:
        char = next(step, None)
        char = emphasis(char, step, '*', 'b')
        char = emphasis(char, step, '/', 'i')
        char = emphasis(char, step, '_', 'u')
        char = emphasis(char, step, '=', 'v')
        char = emphasis(char, step, '~', 'pre')
        char = img(char, step)
        if not char:
            continue
        # Plain character: start a TEXT token or extend the current one.
        if len(tokens) == 0:
            tokens.append(Token('TEXT', char))
            continue
        if tokens[-1].token != 'TEXT':
            tokens.append(Token('TEXT', char))
            continue
        tokens[-1].value += char
    return tokens
def parse(stream):
doc = Document()
block = False
@ -103,4 +191,9 @@ def parse(stream):
doc.update(result[1].value)
continue
doc.append(result[1])
for item in doc.filter('TEXT'):
#print('@@@@@@@@@@@@@@@@@')
#print(item.value)
item.value = parse_text(item.value)
return doc

View File

@ -1,2 +1,2 @@
__version__ = 0.1
__version__ = 0.2

View File

@ -25,10 +25,10 @@ Simple raw html generation
head -n 5 examples/html-plain/example.py
#+END_SRC
** Enaml web templating language
Written mainly to try out enaml-web
#+BEGIN_SRC sh :results output drawer
head -n 5 examples/html-enaml/example.py
#+END_SRC

View File

@ -0,0 +1,2 @@
*bold*, /italic/, _underlined_, =verbatim= and ~code~, and, if you must, +strike-through+

View File

@ -11,5 +11,5 @@ def test_fetch_attribute():
def test_fetch_non_existant_attribute():
with open(os.path.abspath("./tests/fixtures/test.org"), "r") as fp:
doc = parse(fp)
with pytest.raises(ValueError):
with pytest.raises(AttributeError):
doc.fake

31
tests/test_html.py Normal file
View File

@ -0,0 +1,31 @@
import os
import pytest
from eorg.parser import Token
from eorg.parser import parse
from eorg.parser import parse_text
def test_emphasis():
text = "parse emphasis *bold text* _underlined text_ /italic text/ normal text"
expected = [Token(token='TEXT', value='parse emphasis ' ), Token(token='b', value='bold text'), Token(token='TEXT', value=' ' ), Token(token='b', value='underlined text'), Token(token='TEXT', value=' ' ), Token(token='b', value='italic text'), Token('TEXT', ' normal text')]
result = parse_text(text)
assert expected[0].value == result[0].value
assert expected[1].value == result[1].value
assert expected[2].value == result[2].value
assert expected[3].value == result[3].value
assert expected[4].value == result[4].value
assert expected[5].value == result[5].value
assert expected[6].value == result[6].value
def test_image():
text = "parse image [[../../test.jpg][test]] after image"
expected = [
Token("TEXT", "parse image "),
Token("IMG", ["../../test.jpg", "test"]),
Token("TEXT", " after image"),
]
result = parse_text(text)
assert result[0].value == expected[0].value
assert result[1].value == expected[1].value
assert result[2].value == expected[2].value