eorg/eorg/parser.py

206 lines
5.7 KiB
Python

import re
from eorg import tokens
from eorg.tokens import Token
from eorg.const import TOKENS, METADATA, ESCAPE, image_extensions
class Document:
    """Ordered list of parsed tokens plus an index of positions per token type.

    Tokens are stored in ``doc`` in insertion order. ``index`` maps a token
    type to the list of positions in ``doc`` holding tokens of that type,
    which is what makes attribute-style lookup (``document.title``) work.
    """

    # Class-level fallbacks. ``__init__`` gives every instance its own ``doc``
    # and ``index``; the class-level ``index`` additionally keeps
    # ``__getattr__`` from recursing on an instance whose ``__init__`` has not
    # run (``self.index`` would otherwise be looked up via ``__getattr__``).
    pos = 0
    doc = []
    index = {}

    def __init__(self):
        self.doc = []
        self.index = {}

    def __getattr__(self, name, default=None):
        """Return the value(s) of the tokens whose type is ``name.upper()``.

        Only called for names that normal attribute lookup did not find.

        Returns the single value when exactly one token of that type exists,
        otherwise a list of all their values. When no token matches,
        ``default`` is returned if it is not None, else AttributeError is
        raised.
        """
        idx = self.index.get(name.upper(), [])
        if not idx:
            if default is not None:
                return default
            raise AttributeError(f"Attribute of {name} does not exist in document")
        if len(idx) == 1:
            return self.doc[idx[0]].value
        return [self.doc[v].value for v in idx]

    def token(self):
        """Return the type of the last token, or "" for an empty document."""
        if self.doc:
            return self.doc[-1].token
        return ""

    def update(self, value):
        """Append ``value`` to the value of the last token in the document."""
        self.doc[-1].value += value

    def __iter__(self):
        """Yield every token in order, tracking the current index in ``pos``."""
        self.pos = 0
        for item in self.doc:
            yield item
            # Advanced only after the consumer is done with ``item`` so that
            # ``previous()`` called from the loop body sees the right index.
            self.pos += 1

    def previous(self, match):
        """Return the token before the current iteration position.

        Returns None when iteration is at the first token or when the
        preceding token's type is not ``match``.
        """
        # Value comparison: ``is 0`` tested object identity against a literal.
        if self.pos == 0:
            return None
        if self.doc[self.pos - 1].token != match:
            return None
        return self.doc[self.pos - 1]

    def filter(self, value):
        """Only return types that are of interest, like source blocks."""
        for item in self.doc:
            if item.token == value:
                yield item

    def body(self):
        """Yield every token whose type is not listed in METADATA."""
        for item in self.doc:
            if item.token in METADATA:
                continue
            yield item

    def images(self):
        """Yield the images found in the document.

        NOTE(review): a top-level IMAGE token yields ``item.value[0]`` while
        an IMAGE nested in a TEXT token's list value yields the token object
        itself; confirm callers expect both shapes.
        """
        # Iterating through ``__iter__`` also resets and advances ``pos``.
        for item in self.__iter__():
            if item.token == tokens.IMAGE:
                yield item.value[0]
            if item.token == tokens.TEXT:
                if isinstance(item.value, list):
                    for token in item.value:
                        if token.token == tokens.IMAGE:
                            yield token

    def __len__(self):
        """Return the number of tokens in the document."""
        return len(self.doc)

    def append(self, value):
        """Add a token to the end of the document and record it in the index."""
        # The position is recorded before appending, so it equals the index
        # the token is about to occupy in ``doc``.
        self.index.setdefault(value.token, []).append(len(self.doc))
        self.doc.append(value)
def parse_attrs(text):
    """Parse the header text of a source block into an attribute dict.

    ``text`` is split on ':'. The part before the first colon, stripped, is
    stored under ``'language'``. Every later part is split on whitespace: its
    first word becomes a key and the remaining words its list of values.

    Runs of whitespace between words are treated as one separator, and empty
    segments (for example from ``"::"``) are ignored instead of producing an
    empty-string key.
    """
    language, *rows = text.split(':')
    attrs = {'language': language.strip()}
    for row in rows:
        words = row.split()
        if not words:
            continue
        attrs[words[0]] = words[1:]
    return attrs
def parsebody(text, rx):
    """Handle one line read while inside a block.

    ``rx`` is searched for in ``text``. If it is found the result is
    ``(False, None)``; otherwise the result is ``(rx, text + "\n")``, i.e. the
    same pattern together with the line restored to newline-terminated form.
    """
    if re.search(rx, text) is None:
        return rx, text + "\n"
    return False, None
def parseline(text):
    """Classify one input line and wrap it in a Token.

    The entries of TOKENS, each ``key -> (rx, block, s, e, count)``, are tried
    in iteration order and the first whose ``rx`` is found in ``text`` decides
    the result.

    Returns a ``(block, token)`` pair. ``block`` is the matching entry's
    ``block`` value, or False when no entry matched. For an unmatched line the
    token is BLANK if the stripped line is empty, otherwise LIST with the
    stripped text plus a trailing space.
    """
    for key, (rx, block, s, e, count) in TOKENS.items():
        found = re.search(rx, text)
        if found is None:
            continue

        matched = found.group(0)
        rest = text[found.end():]

        if key == tokens.META:
            # The token type comes from a slice of the matched text itself.
            return block, Token(token=matched[s:e], value=rest)
        if key == tokens.SOURCE:
            return block, Token(token=key, attrs=parse_attrs(rest))
        if key == tokens.TABLE or key == tokens.BULLET:
            # These keep the whole line, newline-terminated, as their value.
            return block, Token(token=key, value=text + "\n")

        # For counted entries the length of the match is stored as the depth.
        depth_attrs = {'depth': len(matched)} if count is True else None
        return block, Token(token=key, value=rest, attrs=depth_attrs)

    stripped = text.strip()
    if stripped == "":
        return False, Token(token=tokens.BLANK, value=stripped)
    return False, Token(token=tokens.LIST, value=stripped + " ")
def parse_text(txt):
    """Split inline text into a list of tokens.

    The text is scanned one character at a time. Spans delimited by ``*``,
    ``/``, ``_``, ``=`` and ``~`` become BOLD, ITALIC, UNDERLINED, VERBATIM
    and ``'PRE'`` tokens; ``[[path]]`` / ``[[path][alt]]`` become IMAGE tokens
    when ``path`` ends with one of ``image_extensions`` and LINK tokens
    otherwise, both with the value ``[path, alt]``. Everything else is
    collected into TEXT tokens, consecutive characters sharing one token.

    Unterminated spans and links end at the end of the input rather than
    raising, and a single ``[`` that does not open a link is kept as text.

    Returns the list of tokens in the order they occur in ``txt``.
    """
    char = True  # any non-None value, so the scanning loop is entered
    tokenlist = []

    def img(char, step):
        """Consume a ``[[...]]`` link at ``char``; return what is left over.

        Returns ``char`` unchanged when it does not start a link, ``''`` when
        a link was consumed and its token appended.
        """
        if char != '[':
            return char
        char = next(step, None)
        if char != '[':
            # Not a link: keep the '[' that was read, together with the
            # look-ahead character, as literal text instead of dropping it.
            return '[' if char is None else '[' + char
        stops = [']'] + ESCAPE
        char = next(step, None)
        path = ''
        # ``char is not None`` stops at end of input; without it an
        # unterminated link would try to concatenate None to the string.
        while char is not None and char not in stops:
            path += char
            char = next(step, None)
        char = next(step, None)
        alt = ''
        if char == '[':
            char = next(step, None)
            while char is not None and char not in stops:
                alt += char
                char = next(step, None)
            # Skip the character following the description's terminator.
            char = next(step, None)
        if path.endswith(image_extensions):
            tokenlist.append(Token(tokens.IMAGE, [path, alt]))
            return ''
        tokenlist.append(Token(tokens.LINK, [path, alt]))
        return ''

    def emphasis(char, step, end, tag):
        """Consume a span delimited by ``end`` and append it as a ``tag`` token.

        Returns ``char`` unchanged when it is not the delimiter, ``''`` when a
        span was consumed.
        """
        if not char or char != end:
            return char
        stops = [end] + ESCAPE
        char = next(step, None)
        r = ''
        while char and char not in stops:
            r += char
            char = next(step, None)
        tokenlist.append(Token(tag, r))
        return ''

    step = iter(txt)
    while char is not None:
        char = next(step, None)
        char = emphasis(char, step, '*', tokens.BOLD)
        char = emphasis(char, step, '/', tokens.ITALIC)
        char = emphasis(char, step, '_', tokens.UNDERLINED)
        char = emphasis(char, step, '=', tokens.VERBATIM)
        char = emphasis(char, step, '~', 'PRE')
        char = img(char, step)
        if not char:
            # None at end of input, or '' when a helper consumed the input.
            continue
        if not tokenlist or tokenlist[-1].token != tokens.TEXT:
            tokenlist.append(Token(tokens.TEXT, char))
            continue
        tokenlist[-1].value += char
    return tokenlist
def parse(stream):
    """Build a Document from an iterable of text lines.

    Each line has its newline characters stripped from both ends and is
    classified with ``parseline``. When ``parseline`` returns a block value
    other than False, the following lines are passed to ``parsebody`` with
    that value and appended to the last token until ``parsebody`` reports
    False; that final line is consumed without being tokenized. Consecutive
    LIST tokens are merged into one. Finally the value of every LIST token is
    replaced by the result of ``parse_text``.

    Returns the populated Document.
    """
    doc = Document()
    block = False
    for raw in stream:
        line = raw.strip('\n')

        if block is not False:
            # Inside a block: the line either extends the last token or
            # ends the block.
            block, body = parsebody(line, block)
            if block:
                doc.update(body)
            continue

        block, token = parseline(line)
        if not token:
            continue
        if doc.token() == tokens.LIST and token.token == tokens.LIST:
            # Fold this line into the LIST token that is already last.
            doc.update(token.value)
        else:
            doc.append(token)

    for entry in doc.filter(tokens.LIST):
        entry.value = parse_text(entry.value)
    return doc