initial
This commit is contained in:
65
Generate_Model.py
Normal file
65
Generate_Model.py
Normal file
@@ -0,0 +1,65 @@
|
||||
import json
|
||||
import argparse
|
||||
import os
|
||||
|
||||
def get_token(file):
    """Yield word and punctuation tokens from an iterable of text lines.

    Words are maximal runs of characters that are neither whitespace nor
    one of the punctuation characters; each punctuation character is
    emitted as its own token.

    Fixes over the original:
    - punctuation immediately after whitespace (or other punctuation) no
      longer yields an empty-string token;
    - a word that runs up to the end of the input is flushed instead of
      being silently dropped.
    """
    punct_tokens = '",.!?;:-_'
    word = ""
    for line in file:
        for c in line:
            if c.isspace():
                if word != "":
                    yield word
                word = ""
            elif c in punct_tokens:
                # Guard added: only emit the accumulated word when non-empty.
                if word != "":
                    yield word
                yield c
                word = ""
            else:
                word += c
    # Flush a trailing word when the input does not end with whitespace
    # or punctuation.
    if word != "":
        yield word
|
||||
|
||||
def update_model(model, file):
    """Accumulate trigram counts from *file* into the nested *model* dict.

    *model* maps token -> token -> token, each level carrying a "_count"
    key with the number of times that prefix was seen.  The dict is
    mutated in place; nothing is returned.

    Fixes over the original:
    - the loop's bare ``except:`` (which also swallowed KeyboardInterrupt
      and real bugs) is narrowed to ``StopIteration``;
    - inputs with fewer than two tokens no longer raise StopIteration out
      of the function — they simply leave *model* unchanged.
    """
    def bump(level, token):
        # Increment the counter for *token* at this level, creating the
        # node on first sight, and return the nested level for chaining.
        node = level.setdefault(token, {"_count": 0})
        node["_count"] += 1
        return node

    token_stream = get_token(file)
    try:
        former = next(token_stream)
        current = next(token_stream)
    except StopIteration:
        return
    # The for-loop absorbs StopIteration, replacing the original
    # while/try/except-break construction.
    for prior in token_stream:
        bump(bump(bump(model, former), current), prior)
        former, current = current, prior
|
||||
|
||||
def main(args):
    """Build (or extend) a trigram model and write it to *args.out* as JSON.

    args.input  -- a text file, or a directory whose entries are all read
    args.model  -- optional path of an existing model JSON to extend
    args.out    -- output path for the resulting model JSON
    """
    if args.model:
        # Continue training from a previously saved model.
        with open(args.model, encoding="utf-8") as m_file:
            model = json.load(m_file)
    else:
        model = {}
    if os.path.isdir(args.input):
        # os.path.join for portability (was a hard-coded "/" f-string).
        # NOTE(review): directory entries are assumed to all be readable
        # text files — a subdirectory here would make open() fail.
        files = [os.path.join(args.input, d) for d in os.listdir(args.input)]
    else:
        files = [args.input]
    for f_name in files:
        with open(f_name, "r", encoding="utf-8") as file:
            update_model(model, file)
    with open(args.out, "w", encoding="utf-8") as m_file:
        json.dump(model, m_file)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: positional input path (file or directory),
    # optional existing model to extend, and the JSON output path.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('input')
    arg_parser.add_argument('-m', '--model', default='')
    arg_parser.add_argument('-o', '--out', default='out.json')
    main(arg_parser.parse_args())
|
||||
41
Markov.py
Normal file
41
Markov.py
Normal file
@@ -0,0 +1,41 @@
|
||||
import random
|
||||
import json
|
||||
import argparse
|
||||
|
||||
def select(model: dict):
    """Draw one key from *model*, weighted by each child's "_count".

    The "_count" bookkeeping key itself is excluded from the draw.
    """
    tokens = [key for key in model if key != "_count"]
    weights = [model[key]["_count"] for key in tokens]
    return random.choices(tokens, weights, k=1)[0]
|
||||
|
||||
def generate_token(model, former, current):
    """Pick the next token given the two preceding tokens.

    With probability 0.90 sample from the trigram context
    model[former][current]; with probability 0.08 fall back to the bigram
    context model[current]; otherwise — or whenever a context is missing —
    sample from the unigram distribution.
    """
    p = random.random()
    if p <= 0.90:
        try:
            return select(model[former][current])
        except (KeyError, IndexError):
            # Was a bare `except:`.  Catch only a missing context
            # (KeyError) or a leaf node with no successors, where the
            # weighted draw fails on an empty population (IndexError) —
            # not KeyboardInterrupt or genuine bugs.
            pass
    if p <= 0.98:
        try:
            return select(model[current])
        except (KeyError, IndexError):
            pass
    return select(model)
|
||||
|
||||
def main(args):
    """Load the trigram model at *args.model* and print *args.size*
    sampled tokens, space-separated, on one line."""
    with open(args.model) as m_file:
        model = json.load(m_file)
    # Seed the chain: a weighted-random first token, then a second token
    # drawn from the first token's successors.
    former = select(model)
    current = select(model[former])
    print(f"{former} {current} ", end="")
    # Two tokens are already out, so emit size - 2 more.
    for _ in range(args.size - 2):
        nxt = generate_token(model, former, current)
        print(nxt + " ", end="")
        former, current = current, nxt
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: positional model JSON path and an
    # optional token count (default 100).
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('model')
    arg_parser.add_argument('-s', '--size', default=100, type=int)
    main(arg_parser.parse_args())
|
||||
13
helpy.py
Normal file
13
helpy.py
Normal file
@@ -0,0 +1,13 @@
|
||||
import wx
|
||||
|
||||
class MainWindow(wx.Frame):
    """Minimal wx frame: a multi-line text control plus a status bar."""

    def __init__(self, parent, title):
        """Build the 200x100 editor window and show it immediately."""
        super().__init__(parent, title=title, size=(200, 100))
        self.control = wx.TextCtrl(self, style=wx.TE_MULTILINE)
        self.CreateStatusBar()  # A Statusbar in the bottom of the window
        self.Show(True)
|
||||
|
||||
# Launch the sample editor: create the application object, build the main
# frame (which shows itself in __init__), then enter the blocking event
# loop.  NOTE(review): the False argument presumably disables wx's
# stdout/stderr redirection window — confirm against the wx.App docs.
app = wx.App(False)
frame = MainWindow(None, "Sample editor")
app.MainLoop()
|
||||
BIN
potter_sample.txt
Normal file
BIN
potter_sample.txt
Normal file
Binary file not shown.
0
simple.txt
Normal file
0
simple.txt
Normal file
1
test.json
Normal file
1
test.json
Normal file
@@ -0,0 +1 @@
|
||||
{"Hello": {"_count": 3, ",": {"_count": 2, "this": {"_count": 2}}, "this": {"_count": 1, "is": {"_count": 1}}}, ",": {"_count": 3, "this": {"_count": 3, "is": {"_count": 3}}}, "this": {"_count": 4, "is": {"_count": 4, "Liz": {"_count": 3}, ",": {"_count": 1}}}, "is": {"_count": 5, "Liz": {"_count": 3, "Miller": {"_count": 3}}, "a": {"_count": 1, "small": {"_count": 1}}, ",": {"_count": 1, "this": {"_count": 1}}}, "Liz": {"_count": 3, "Miller": {"_count": 3, ".": {"_count": 2}, "This": {"_count": 1}}}, "Miller": {"_count": 3, ".": {"_count": 2, "This": {"_count": 1}, "Hello": {"_count": 1}}, "This": {"_count": 1, "is": {"_count": 1}}}, ".": {"_count": 3, "This": {"_count": 1, "is": {"_count": 1}}, "Hello": {"_count": 2, "this": {"_count": 1}, ",": {"_count": 1}}}, "This": {"_count": 2, "is": {"_count": 2, "a": {"_count": 2}}}, "a": {"_count": 2, "small": {"_count": 1, "test": {"_count": 1}}, "trigram": {"_count": 1, "markov": {"_count": 1}}}, "small": {"_count": 1, "test": {"_count": 1, "text": {"_count": 1}}}, "test": {"_count": 1, "text": {"_count": 1, "from": {"_count": 1}}}, "text": {"_count": 1, "from": {"_count": 1, "which": {"_count": 1}}}, "from": {"_count": 1, "which": {"_count": 1, "to": {"_count": 1}}}, "which": {"_count": 1, "to": {"_count": 1, "generate": {"_count": 1}}}, "to": {"_count": 1, "generate": {"_count": 1, "a": {"_count": 1}}}, "generate": {"_count": 1, "a": {"_count": 1, "trigram": {"_count": 1}}}, "trigram": {"_count": 1, "markov": {"_count": 1, "language": {"_count": 1}}}, "markov": {"_count": 1, "language": {"_count": 1, "model": {"_count": 1}}}, "language": {"_count": 1, "model": {"_count": 1, ".": {"_count": 1}}}, "model": {"_count": 1, ".": {"_count": 1, "Hello": {"_count": 1}}}}
|
||||
2
test.txt
Normal file
2
test.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
Hello, this is Liz Miller. This is a small test text from which to generate a trigram markov language model.
|
||||
Hello this is, this is Liz Miller. Hello, this is Liz Miller This is a small
|
||||
22748
text/1.txt
Normal file
22748
text/1.txt
Normal file
File diff suppressed because it is too large
Load Diff
9106
text/2.txt
Normal file
9106
text/2.txt
Normal file
File diff suppressed because it is too large
Load Diff
4265
text/3.txt
Normal file
4265
text/3.txt
Normal file
File diff suppressed because it is too large
Load Diff
22310
text/4.txt
Normal file
22310
text/4.txt
Normal file
File diff suppressed because it is too large
Load Diff
12724
text/5.txt
Normal file
12724
text/5.txt
Normal file
File diff suppressed because it is too large
Load Diff
1
text/Harry_Potter_all_books_preprocessed.txt
Normal file
1
text/Harry_Potter_all_books_preprocessed.txt
Normal file
File diff suppressed because one or more lines are too long
7317
text/frank.txt
Normal file
7317
text/frank.txt
Normal file
File diff suppressed because it is too large
Load Diff
33214
text/ulysses.txt
Normal file
33214
text/ulysses.txt
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user