2025-07-01

2026-03-17 14:30:01 -06:00
parent f9a22056dd
commit 62b5978595
4579 changed files with 1257472 additions and 0 deletions
@@ -0,0 +1,121 @@
+'''
+Copyright (C) 2023 CG Cookie
+http://cgcookie.com
+hello@cgcookie.com
+
+Created by Jonathan Denning, Jonathan Williamson
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+'''
+
+import re
+
+# Block-level markdown recognizers, keyed by element kind.  The patterns
+# are meant to be applied with ``match`` so that only the beginning of a
+# line is inspected (for a table this is just its first row).  The
+# insertion order below is the order in which the patterns are tried.
+line_tests = {
+    kind: re.compile(pattern)
+    for kind, pattern in (
+        # headings: one to three leading '#', then the heading text
+        ('h1',    r'(?<!#)# +(?P<text>.+)'),
+        ('h2',    r'(?<!#)## +(?P<text>.+)'),
+        ('h3',    r'(?<!#)### +(?P<text>.+)'),
+        # list items: optional indent, a bullet or a number, then the text
+        ('ul',    r'(?P<indent> *)- +(?P<text>.+)'),
+        ('ol',    r'(?P<indent> *)\d+\. +(?P<text>.+)'),
+        # image: ![caption](filename optional-style)
+        ('img',   r'!\[(?P<caption>[^\]]*)\]\((?P<filename>[^) ]+)(?P<style>[^)]*)\)'),
+        # table row: | cell | cell |
+        ('table', r'\| +(([^|]*?) +\|)+'),
+    )
+}
+
+# Inline markdown recognizers, keyed by element kind.  The patterns are
+# meant to be applied with ``match`` at the current position in the text;
+# the insertion order below is the order in which they are tried.
+inline_tests = {
+    kind: re.compile(pattern)
+    for kind, pattern in (
+        # explicit line break, swallowing any spaces that follow it
+        ('br',     r'<br */?> *'),
+        # image: ![caption](filename optional-style)
+        ('img',    r'!\[(?P<caption>[^\]]*)\]\((?P<filename>[^) ]+)(?P<style>[^)]*)\)'),
+        # emphasis and code spans
+        ('bold',   r'\*(?P<text>.+?)\*'),
+        ('code',   r'`(?P<text>[^`]+)`'),
+        ('link',   r'\[(?P<text>.+?)\]\((?P<link>.+?)\)'),
+        ('italic', r'_(?P<text>.+?)_'),
+        # generic html element: either <tag params>contents</tag> or <tag params />
+        ('html',   r'''<((?P<tagname>[a-zA-Z]+)(?P<params>( +(?P<key>[a-zA-Z_]+(=(?P<val>"[^"]*"|'[^']*'|[^"' >]+))?)))*)(>(?P<contents>.*?)(?P<closetag></\2>)|(?P<selfclose> +/>))'''),
+
+        # disabled element-specific patterns, kept for reference:
+        # 'checkbox': re.compile(r'<input (?P<params>.*?type="checkbox".*?)>(?P<innertext>.*?)<\/input>'),
+        # 'number':   re.compile(r'<input (?P<params>.*?type="number".*?)>'),
+        # 'button':   re.compile(r'<button(?P<params>[^>]*)>(?P<innertext>.*?)<\/button>'),
+        # 'progress': re.compile(r'<progress(?P<params>.*?)(>(?P<innertext>.*?)<\/progress>| \/>)'),
+
+        # arrow entities, swallowing any spaces that follow them
+        # https://www.toptal.com/designers/htmlarrows/arrows/
+        ('arrow',  r'&(?P<dir>uarr|darr|larr|rarr|harr|varr|uArr|dArr|lArr|rArr|hArr|vArr); *'),
+    )
+}
+
+# Clean-up passes that make raw markdown text behave similarly to Markdown.
+# Each entry is a (regex pattern, replacement) pair; the passes are meant
+# to be applied to the whole text one after another, in the order listed.
+preprocessing = [
+    # remove comments; (?s) lets `.` cross newlines so that a comment
+    # spanning several lines is removed as well
+    (r'(?s)<!--.*?-->', r''),
+    (r'^\n*',       r''),       # remove leading \n
+    (r'\n*$',       r''),       # remove trailing \n
+    (r'\n\n\n*',    r'\n\n'),   # 2+ \n => \n\n
+    (r'---',        r'—'),      # em dash (must run before the en dash pass)
+    (r'(?<!-)--',   r'–'),      # en dash
+]
+
+# https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url
+# The host part is a single character-class run followed by a dot.  Because
+# the class itself contains '.', this accepts exactly the same strings as a
+# repeated `(label\.)*` group would, but without the nested quantifier that
+# makes the regex engine backtrack exponentially on dotted non-URLs.
+# Groups: 1 = scheme, 2 = http/https (if any), 3 = host up to the last dot,
+# 4 = everything after the top-level domain.
+re_url = re.compile(r'^((https?)|mailto)://([-a-zA-Z0-9@:%._+~#=]+\.)[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_+.~#?&/=]*)$')
+# first word that contains an html character reference (ex: &nbsp; or &#x27;):
+# pre = text of the word before it, code = the reference, post = rest of the line
+re_html_char = re.compile(r'(?P<pre>[^ ]*?)(?P<code>&([a-zA-Z]+|#x?[0-9A-Fa-f]+);)(?P<post>.*)')
+# first word that contains an inline code span after other text (ex: (`-`)):
+# pre = text of the word before it, code = the code span, post = rest of the line
+re_embedded_code = re.compile(r'(?P<pre>[^ `]+)(?P<code>`[^`]*`)(?P<post>.*)')
+
+class Markdown:
+    '''
+    Stateless helpers for recognizing and splitting markdown text, built on
+    the module-level regex tables (`line_tests`, `inline_tests`,
+    `preprocessing`, `re_url`, `re_html_char`, `re_embedded_code`).
+    '''
+
+    @staticmethod
+    def preprocess(txt):
+        '''
+        Return `txt` after running every (pattern, replacement) pair of
+        `preprocessing` over it, in order.
+        '''
+        for pattern, replacement in preprocessing:
+            txt = re.sub(pattern, replacement, txt)
+        return txt
+
+    @staticmethod
+    def is_url(txt):
+        '''Return True if `txt` as a whole is accepted by `re_url`.'''
+        return re_url.match(txt) is not None
+
+    @staticmethod
+    def _first_match(tests, line):
+        '''
+        Try each regex of `tests` (a name -> compiled regex mapping) at the
+        start of `line`, in mapping order.  Return (name, match) for the
+        first regex that matches, or (None, None) if none does.
+        '''
+        for kind, regex in tests.items():
+            m = regex.match(line)
+            if m: return (kind, m)
+        return (None, None)
+
+    @staticmethod
+    def match_inline(line):
+        '''
+        Find the inline element (see `inline_tests`) that `line` starts with.
+        Leading spaces are significant and are not stripped.
+        Returns (kind, match), or (None, None) if nothing matches.
+        '''
+        return Markdown._first_match(inline_tests, line)
+
+    @staticmethod
+    def match_line(line):
+        '''
+        Find the block element (see `line_tests`) that `line` starts with,
+        ignoring trailing whitespace.
+        Returns (kind, match), or (None, None) if nothing matches.
+        '''
+        return Markdown._first_match(line_tests, line.rstrip())
+
+    @staticmethod
+    def split_word(line, allow_empty_pre=False):
+        '''
+        Split the first word off `line` and return (word, rest).
+
+        - A word is cut in front of an html character reference (ex: &amp;)
+          and in front of an inline code span embedded in it (ex: (`-`)).
+        - &nbsp; does not end a word: it is kept in the word together with
+          whatever word follows it.
+        - Otherwise the word runs up to and including the first space, or
+          to the end of `line` when there is no space.
+
+        allow_empty_pre: when True, a character reference at the very start
+        of `line` yields an empty word (the reference stays in `rest`)
+        instead of being returned as the word itself.
+
+        `rest` is always sliced from `line` itself, so nothing is dropped
+        even when `line` contains newlines.
+        '''
+        joined = ''     # text already glued to the word by &nbsp;
+        while True:
+            # search for html characters, like &nbsp;
+            m = re_html_char.match(line)
+            if m is None: break
+            pr = m.group('pre')
+            co = m.group('code')
+            po = line[m.end('code'):]
+            if co == '&nbsp;':
+                # &nbsp; must get handled specially later!
+                # for now, consider &nbsp; part of the word and keep going
+                # with the text that follows it
+                joined += f'{pr}{co}'
+                line = po
+                allow_empty_pre = True
+                continue
+            if pr or allow_empty_pre:
+                return (f'{joined}{pr}', f'{co}{po}')
+            return (f'{joined}{co}', po)
+        # search for embedded code in word, like (`-`)
+        m = re_embedded_code.match(line)
+        if m:
+            return (f'{joined}{m.group("pre")}', line[m.end('pre'):])
+        # plain word: keep the separating space (if any) with the word
+        head, space, rest = line.partition(' ')
+        return (f'{joined}{head}{space}', rest)
+