Improved text token parsing
This commit is contained in:
parent
1371a1a8dc
commit
82ec528406
33
eorg/cmd.py
33
eorg/cmd.py
|
@ -1,12 +1,14 @@
|
|||
import os
|
||||
import sys
|
||||
from eorg.parser import parse
|
||||
from eorg.generate import html
|
||||
|
||||
|
||||
|
||||
def tangle(v):
|
||||
def tangle(doc):
|
||||
print("tangle")
|
||||
print(v)
|
||||
code = getattr(doc, 'code')
|
||||
print(code)
|
||||
|
||||
def recursive(path):
|
||||
for root, dirs, filenames in os.walk(path):
|
||||
|
@ -23,11 +25,17 @@ def htmlize(doc):
|
|||
print(item)
|
||||
|
||||
def handler(fp, kwargs):
|
||||
if kwargs.t is True:
|
||||
if kwargs.s is True:
|
||||
tokenize(doc)
|
||||
if kwargs.t is True:
|
||||
tangle(doc)
|
||||
if kwargs.w is True:
|
||||
print(html(doc).read())
|
||||
|
||||
if kwargs.meta:
|
||||
values = {}
|
||||
for item in kwargs.meta:
|
||||
values[item] = getattr(doc, item)
|
||||
print(' | '.join([k + ' - ' + v for k,v in values.items()]))
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
|
@ -35,8 +43,10 @@ if __name__ == "__main__":
|
|||
parser = argparse.ArgumentParser(description="Process some .org files")
|
||||
parser.add_argument("filename")
|
||||
parser.add_argument('-r', action='store_true', help='recursive')
|
||||
parser.add_argument('-w', action='store_true', help='html')
|
||||
parser.add_argument('-t', action='store_true', help='html')
|
||||
parser.add_argument('-w', action='store_true', help='Generate html')
|
||||
parser.add_argument('-s', action='store_true', help='Document structure')
|
||||
parser.add_argument('-t', action='store_true', help='Tangle out code')
|
||||
parser.add_argument('-m', '--meta', action='append', help='Show meta data')
|
||||
parser.add_argument(
|
||||
"--tangle",
|
||||
dest="tangle",
|
||||
|
@ -47,12 +57,17 @@ if __name__ == "__main__":
|
|||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
print(args.filename)
|
||||
print(args.r)
|
||||
filename = os.path.abspath(args.filename)
|
||||
|
||||
|
||||
if args.r is True:
|
||||
for filename in recursive(filename):
|
||||
with open(filename, "r") as fp:
|
||||
doc = parse(fp)
|
||||
handler(parse(fp), args)
|
||||
sys.exit()
|
||||
|
||||
with open(args.filename, "r") as fp:
|
||||
with open(filename, "r") as fp:
|
||||
doc = parse(fp)
|
||||
handler(parse(fp), args)
|
||||
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
ESCAPE = ['\n']
|
||||
|
||||
METADATA = ['TITLE', 'AUTHOR', 'EMAIL', 'DESCRIPTION', 'KEYWORDS']
|
||||
t_META = r"^[#]\+(" + '|'.join(METADATA) +")\:"
|
||||
t_BLANK_LINE = '^\s*$'
|
||||
|
@ -8,6 +10,8 @@ t_EXAMPLE_END = r"^\#\+END_EXAMPLE"
|
|||
t_SRC_BEGIN = r"^\#\+BEGIN_SRC\s+"
|
||||
t_SRC_END = r"^\#\+END_SRC"
|
||||
t_RESULTS_START = r"^\#\+RESULTS:"
|
||||
t_CAPTIONS = r"^\#\+CAPTION:"
|
||||
t_IMG = r"^\[\[\s]]$"
|
||||
t_RESULTS_END = r"^\:..*"
|
||||
|
||||
t_HEADER = r"^\*+"
|
||||
|
@ -17,7 +21,21 @@ TOKENS = {
|
|||
"META": (t_META, False, 2, -1, False),
|
||||
"COMMENT": (t_COMMENT_BEGIN, t_COMMENT_END, 2, None, False),
|
||||
"EXAMPLE": (t_EXAMPLE_BEGIN, t_EXAMPLE_END, 2, None, False),
|
||||
"IMG": (t_IMG, False, 2, None, False),
|
||||
"CAPTION": (t_CAPTIONS, False, 2, None, False),
|
||||
"SRC_BEGIN": (t_SRC_BEGIN, t_SRC_END, 2, None, False),
|
||||
"RESULTS": (t_SRC_BEGIN, t_SRC_END, 2, None, False),
|
||||
"HEADER": (t_HEADER, False, 1, None, True),
|
||||
}
|
||||
|
||||
|
||||
class Token:
    """A single lexical unit of a parsed document.

    Attributes:
        token: The token type name, e.g. ``"TEXT"`` or ``"IMG"``.
        value: The payload carried by the token (a string, or a list for
            tokens such as ``IMG`` that carry several fields).
    """

    __slots__ = ["token", "value"]

    def __init__(self, token, value):
        self.token = token
        self.value = value

    def __repr__(self):
        # ``!r`` keeps string payloads quoted so the output stays unambiguous
        # even when the value itself contains commas or ``value=``.
        return f"Token(token={self.token!r}, value={self.value!r})"
|
||||
|
||||
|
|
|
@ -1,39 +1,61 @@
|
|||
from io import StringIO
|
||||
from eorg.const import Token, ESCAPE
|
||||
from pygments import highlight
|
||||
from pygments.lexers import PythonLexer
|
||||
from pygments.lexers import get_lexer_by_name
|
||||
from pygments.formatters import HtmlFormatter
|
||||
|
||||
def src(code):
|
||||
def src(doc, code, cls=''):
|
||||
lexer = get_lexer_by_name('lisp')
|
||||
return highlight(code, lexer, HtmlFormatter())
|
||||
|
||||
def img(doc, item, cls=''):
    """Render an image token as an ``<img>`` element.

    Args:
        doc: The document being rendered. ``doc.previous('CAPTION')`` is
            asked for a caption token sitting directly before this image.
        item: Two-item sequence ``[src, alt]``.
        cls: Pre-rendered attribute text inserted right after ``<img``
            (for example ``' class="responsive-img"'``), or ``''``.

    Returns:
        The ``<img>`` markup, followed by a centred ``<p>`` holding the
        caption value when a caption was found.
    """
    # Imported locally under a distinct name: this module defines its own
    # ``html`` function, which would shadow a top-level ``import html``.
    from html import escape

    caption = doc.previous('CAPTION')
    text = ''
    if caption:
        # The caption value is emitted verbatim, as before.
        text = f'<p class="center-align">{caption.value}</p>'

    # Attribute values must be escaped, otherwise a double quote or
    # ampersand in the path or alt text produces broken markup.
    src = escape(str(item[0]), quote=True)
    alt = escape(str(item[1]), quote=True)
    return f'<img{cls} style="margin:auto;" src="{src}" alt="{alt}" />{text}'
|
||||
|
||||
|
||||
def parse_text_html(doc, tokens, cls=''):
    """Render the value of a TEXT token as HTML.

    Args:
        doc: The document being rendered; passed through to ``handle_token``.
        tokens: Either a list of inline tokens or a plain string.
        cls: Pre-rendered attribute text for the ``<p>`` tag, or ``''``.

    Returns:
        For a list, the concatenated markup of every inline token (an empty
        list yields ``''``). For anything else, the value wrapped in ``<p>``.
    """
    if isinstance(tokens, list):
        # Render every inline token; returning from inside the loop would
        # drop everything after the first one.
        return ''.join(handle_token(doc, token) for token in tokens)
    return f'<p{cls}>{tokens}</p>'
|
||||
|
||||
builddoc ={
|
||||
"HEADER1": ("h2", None),
|
||||
"HEADER2": ("h3", None),
|
||||
"HEADER3": ("h4", None),
|
||||
# "BREAK": "br",
|
||||
"TEXT": ("p", "flow-text"),
|
||||
"IMG": (img, 'materialboxed center-align responsive-img'),
|
||||
"B": ("b", None),
|
||||
"U": ("u", None),
|
||||
"i": ("i", None),
|
||||
"TEXT": (parse_text_html, "flow-text"),
|
||||
"SRC_BEGIN": (src, None),
|
||||
"EXAMPLE": ('blockquote', None),
|
||||
}
|
||||
|
||||
|
||||
def html(doc):
|
||||
def handle_token(doc, item):
|
||||
response = StringIO()
|
||||
for item in doc:
|
||||
match = builddoc.get(item.token)
|
||||
if not match:
|
||||
continue
|
||||
return ''
|
||||
tag, cls = match
|
||||
if cls:
|
||||
cls = f' class="{cls}"'
|
||||
else:
|
||||
cls = ''
|
||||
if callable(tag):
|
||||
response.write(tag(item.value))
|
||||
continue
|
||||
return tag(doc, item.value, cls)
|
||||
else:
|
||||
response.write('<%s%s>%s</%s>\n' % (tag, cls, item.value, tag))
|
||||
return '<%s%s>%s</%s>\n' % (tag, cls, item.value, tag)
|
||||
|
||||
|
||||
def html(doc):
    """Render every token of ``doc`` and return the markup as a text stream.

    Each token is passed to ``handle_token`` in iteration order and the
    results are written to an in-memory buffer. The buffer is rewound to the
    start before it is returned, so callers can ``read()`` it straight away.
    """
    buffer = StringIO()
    # The generator is consumed lazily, so each token is rendered while the
    # document iteration is paused on it.
    buffer.writelines(handle_token(doc, token) for token in doc)
    buffer.seek(0)
    return buffer
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
import re
|
||||
from eorg.const import TOKENS, METADATA
|
||||
from eorg.const import TOKENS, METADATA, ESCAPE
|
||||
|
||||
|
||||
class Token:
|
||||
|
@ -14,6 +14,7 @@ class Token:
|
|||
|
||||
|
||||
class Document:
|
||||
pos = 0
|
||||
doc = []
|
||||
index = {}
|
||||
|
||||
|
@ -26,7 +27,7 @@ class Document:
|
|||
if not idx:
|
||||
if default is not None:
|
||||
return default
|
||||
raise ValueError(f"Attribute of {name} does not exist in document")
|
||||
raise AttributeError(f"Attribute of {name} does not exist in document")
|
||||
if len(idx) == 1:
|
||||
return self.doc[idx[0]].value
|
||||
return [self.doc[v].value for v in idx]
|
||||
|
@ -40,8 +41,23 @@ class Document:
|
|||
self.doc[-1].value += value
|
||||
|
||||
def __iter__(self):
|
||||
self.pos = 0
|
||||
for item in self.doc:
|
||||
yield item
|
||||
self.pos += 1
|
||||
|
||||
def previous(self, match):
|
||||
if self.pos is 0:
|
||||
return None
|
||||
if self.doc[self.pos-1].token != match:
|
||||
return None
|
||||
return self.doc[self.pos-1]
|
||||
|
||||
def filter(self, value):
|
||||
"""Only return types that are of intrest like source blocks"""
|
||||
for item in self.doc:
|
||||
if item.token == value:
|
||||
yield item
|
||||
|
||||
def body(self):
|
||||
for item in self.doc:
|
||||
|
@ -49,6 +65,16 @@ class Document:
|
|||
continue
|
||||
yield item
|
||||
|
||||
def images(self):
|
||||
for item in self.__iter__():
|
||||
if item.token == 'IMG':
|
||||
yield item.value[0]
|
||||
if item.token == 'TEXT':
|
||||
if isinstance(item.value, list):
|
||||
for token in item.value:
|
||||
if token.token == 'IMG':
|
||||
yield token
|
||||
|
||||
def __len__(self):
|
||||
return len(self.doc)
|
||||
|
||||
|
@ -85,6 +111,68 @@ def parseline(text):
|
|||
return False, Token(token="TEXT", value=text + " ")
|
||||
|
||||
|
||||
def parse_text(txt):
    """Split a run of body text into inline tokens.

    The text is scanned one character at a time. Recognised constructs are:

    * emphasis spans delimited by ``*``, ``/``, ``_``, ``=`` or ``~``, emitted
      as tokens named ``b``, ``i``, ``u``, ``v`` and ``pre`` respectively;
    * ``[[path]]`` / ``[[path][alt]]`` links, emitted as ``IMG`` tokens whose
      value is ``[path, alt]``;
    * everything else, accumulated into ``TEXT`` tokens.

    Args:
        txt: The text to tokenize.

    Returns:
        A list of ``Token`` objects in source order.
    """
    tokens = []
    step = iter(txt)
    link_stops = [']'] + ESCAPE
    markers = (('*', 'b'), ('/', 'i'), ('_', 'u'), ('=', 'v'), ('~', 'pre'))

    def img(char, step):
        """Consume a ``[[path][alt]]`` link starting at ``char``, if any.

        Returns the text still to be treated as plain characters, or ``''``
        when a link was consumed and turned into a token.
        """
        if char != '[':
            return char
        char = next(step, None)
        if char != '[':
            # A lone '[' is ordinary text: hand it back together with the
            # character read while looking ahead instead of dropping it.
            return '[' + (char or '')
        char = next(step, None)

        path = ''
        # ``char is not None`` guards against input ending inside the link,
        # which would otherwise try to concatenate ``None`` onto the path.
        while char is not None and char not in link_stops:
            path += char
            char = next(step, None)
        char = next(step, None)

        alt = ''
        if char == '[':
            char = next(step, None)
            while char is not None and char not in link_stops:
                alt += char
                char = next(step, None)
            # Skip the final closing bracket of ``[[path][alt]]``.
            char = next(step, None)

        tokens.append(Token('IMG', [path, alt]))
        return ''

    def emphasis(char, step, end='*', tag='b'):
        """Consume a span delimited by ``end`` and emit it as a ``tag`` token.

        Returns ``char`` untouched when it is not the opening marker, or
        ``''`` once a span has been consumed.
        """
        if not char or char != end:
            return char
        char = next(step, None)
        span = ''
        while char and char not in [end] + ESCAPE:
            span += char
            char = next(step, None)
        # Use the tag requested by the caller so each marker keeps its own
        # token type instead of everything being reported as bold.
        tokens.append(Token(tag, span))
        return ''

    while True:
        char = next(step, None)
        if char is None:
            break
        for end, tag in markers:
            char = emphasis(char, step, end, tag)
        char = img(char, step)
        if not char:
            # The character was consumed by an emphasis span or a link.
            continue
        if tokens and tokens[-1].token == 'TEXT':
            # Extend the current run of plain text.
            tokens[-1].value += char
        else:
            tokens.append(Token('TEXT', char))
    return tokens
|
||||
|
||||
def parse(stream):
|
||||
doc = Document()
|
||||
block = False
|
||||
|
@ -103,4 +191,9 @@ def parse(stream):
|
|||
doc.update(result[1].value)
|
||||
continue
|
||||
doc.append(result[1])
|
||||
|
||||
for item in doc.filter('TEXT'):
|
||||
#print('@@@@@@@@@@@@@@@@@')
|
||||
#print(item.value)
|
||||
item.value = parse_text(item.value)
|
||||
return doc
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
__version__ = 0.1
|
||||
__version__ = 0.2
|
||||
|
||||
|
|
|
@ -25,10 +25,10 @@ Simple raw html generation
|
|||
head -n 5 examples/html-plain/example.py
|
||||
#+END_SRC
|
||||
|
||||
|
||||
** Enaml web templating language
|
||||
Written mainly to try out enaml-web
|
||||
#+BEGIN_SRC sh :results output drawer
|
||||
head -n 5 examples/html-enaml/example.py
|
||||
#+END_SRC
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
|
||||
*bold*, /italic/, _underlined_, =verbatim= and ~code~, and, if you must, ‘+strike-through+’
|
|
@ -11,5 +11,5 @@ def test_fetch_attribute():
|
|||
def test_fetch_non_existant_attribute():
|
||||
with open(os.path.abspath("./tests/fixtures/test.org"), "r") as fp:
|
||||
doc = parse(fp)
|
||||
with pytest.raises(ValueError):
|
||||
with pytest.raises(AttributeError):
|
||||
doc.fake
|
||||
|
|
|
@ -0,0 +1,31 @@
|
|||
import os
|
||||
import pytest
|
||||
from eorg.parser import Token
|
||||
from eorg.parser import parse
|
||||
from eorg.parser import parse_text
|
||||
|
||||
|
||||
def test_emphasis():
    """Emphasised spans are split out from the surrounding plain text.

    Only the token values are compared, position by position.
    """
    text = "parse emphasis *bold text* _underlined text_ /italic text/ normal text"
    expected = [
        Token(token='TEXT', value='parse emphasis '),
        Token(token='b', value='bold text'),
        Token(token='TEXT', value=' '),
        Token(token='b', value='underlined text'),
        Token(token='TEXT', value=' '),
        Token(token='b', value='italic text'),
        Token('TEXT', ' normal text'),
    ]
    result = parse_text(text)
    for position in range(len(expected)):
        assert expected[position].value == result[position].value
|
||||
|
||||
|
||||
def test_image():
    """An inline ``[[path][alt]]`` link becomes an IMG token between text runs.

    Only the token values are compared, position by position.
    """
    text = "parse image [[../../test.jpg][test]] after image"
    expected = [
        Token("TEXT", "parse image "),
        Token("IMG", ["../../test.jpg", "test"]),
        Token("TEXT", " after image"),
    ]
    result = parse_text(text)
    for position in range(len(expected)):
        assert result[position].value == expected[position].value
|
Loading…
Reference in New Issue