initial
This commit is contained in:
65
Generate_Model.py
Normal file
65
Generate_Model.py
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
import json
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
|
||||||
|
def get_token(file):
    """Yield word and punctuation tokens from an iterable of text lines.

    Words are maximal runs of characters that are neither whitespace nor
    one of the punctuation characters below; each punctuation character
    is emitted as its own single-character token.
    """
    punct_tokens = '",.!?;:-_'
    word = ""
    for line in file:
        for c in line:
            if c.isspace():
                if word != "": yield word
                word = ""
                continue
            if c in punct_tokens:
                # Bug fix: only flush a non-empty word; previously an empty
                # string token was yielded whenever punctuation followed
                # whitespace or another punctuation character.
                if word != "": yield word
                yield c
                word = ""
            else:
                word += c
    # Bug fix: flush the trailing word when the input does not end in
    # whitespace; previously the file's last word was silently dropped.
    if word != "": yield word
|
||||||
|
|
||||||
|
def update_model(model, file):
    """Accumulate trigram counts from *file* into the nested dict *model*.

    For every consecutive token triple (former, current, prior) produced by
    get_token, increments "_count" at each of the three nesting levels:
    model[former], model[former][current], model[former][current][prior].
    Mutates *model* in place; inputs with fewer than three tokens add nothing.
    """
    token_stream = get_token(file)
    try:
        former = next(token_stream)
        current = next(token_stream)
    except StopIteration:
        # Bug fix: fewer than two tokens used to raise an uncaught
        # StopIteration; now there is simply nothing to count.
        return
    # Bug fix: iterate the generator directly instead of a `while True`
    # with a bare `except:` that also swallowed SystemExit/KeyboardInterrupt.
    for prior in token_stream:
        if former not in model:
            model[former] = { "_count": 1 }
        else:
            model[former]["_count"] += 1
        if current not in model[former]:
            model[former][current] = { "_count": 1 }
        else:
            model[former][current]["_count"] += 1
        if prior not in model[former][current]:
            model[former][current][prior] = { "_count": 1 }
        else:
            model[former][current][prior]["_count"] += 1
        former = current
        current = prior
|
||||||
|
|
||||||
|
def main(args):
    """Build (or extend) a trigram model from args.input and dump it to args.out.

    args.input may be a single file or a directory of files; args.model,
    when non-empty, names an existing JSON model to continue training.
    """
    # Start from an existing model when one was given, else from scratch.
    model = {}
    if args.model:
        with open(args.model) as m_file:
            model = json.load(m_file)

    # A directory input means "train on every file directly inside it".
    if os.path.isdir(args.input):
        files = [f"{args.input}/{entry}" for entry in os.listdir(args.input)]
    else:
        files = [args.input]

    for f_name in files:
        with open(f_name, 'r', encoding='utf-8') as file:
            update_model(model, file)

    with open(args.out, "w") as m_file:
        json.dump(model, m_file)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # CLI: positional input path (file or directory), optional starting
    # model (-m/--model) and output path (-o/--out, default out.json).
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('input')
    arg_parser.add_argument('-m', '--model', default='')
    arg_parser.add_argument('-o', '--out', default='out.json')
    main(arg_parser.parse_args())
|
||||||
41
Markov.py
Normal file
41
Markov.py
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
import random
|
||||||
|
import json
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
def select(model: dict):
    """Pick one key of *model* at random, weighted by each child's "_count".

    The bookkeeping "_count" entry of *model* itself is never a candidate.
    """
    candidates = [(key, sub) for key, sub in model.items() if key != "_count"]
    tokens = [key for key, _ in candidates]
    weights = [sub["_count"] for _, sub in candidates]
    return random.choices(tokens, weights, k=1)[0]
|
||||||
|
|
||||||
|
def generate_token(model, former, current):
    """Sample the next token given the two preceding tokens.

    Backoff scheme: with probability 0.90 try the trigram context
    model[former][current]; up to 0.98 fall back to the bigram context
    model[current]; otherwise (or whenever a context is missing from the
    model) fall back to the unigram distribution over the whole model.
    """
    p = random.random()
    if p <= 0.90:
        try:
            return select(model[former][current])
        # Bug fix: narrowed from a bare `except:` that also swallowed
        # SystemExit/KeyboardInterrupt. KeyError: context never seen in
        # training; IndexError/ValueError: select() found no weighted
        # candidates under this context.
        except (KeyError, IndexError, ValueError):
            pass
    if p <= 0.98:
        try:
            return select(model[current])
        except (KeyError, IndexError, ValueError):
            pass
    return select(model)
|
||||||
|
|
||||||
|
def main(args):
    """Load a trigram model from args.model and print args.size generated tokens."""
    with open(args.model) as m_file:
        model = json.load(m_file)

    # Seed the generator with an opening bigram drawn from the model.
    former = select(model)
    current = select(model[former])
    print(f"{former} {current} ", end="")

    # Two tokens are already out; emit the remainder one at a time,
    # sliding the (former, current) context window forward each step.
    for _ in range(args.size - 2):
        prior = generate_token(model, former, current)
        print(prior + " ", end="")
        former, current = current, prior
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # CLI: positional model path (JSON), optional output length (-s/--size).
    cli = argparse.ArgumentParser()
    cli.add_argument('model')
    cli.add_argument('-s', '--size', default=100, type=int)
    main(cli.parse_args())
|
||||||
13
helpy.py
Normal file
13
helpy.py
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
import wx
|
||||||
|
|
||||||
|
class MainWindow(wx.Frame):
    """Top-level application window holding one multi-line text control."""

    def __init__(self, parent, title):
        # Fixed initial window size of 200x100 pixels.
        wx.Frame.__init__(self, parent, title=title, size=(200,100))
        # The editor area fills the frame's client region by default.
        self.control = wx.TextCtrl(self, style=wx.TE_MULTILINE)
        self.CreateStatusBar() # A Statusbar in the bottom of the window

        # Make the frame visible immediately on construction.
        self.Show(True)
|
||||||
|
|
||||||
|
# Create the wx application (False: do not redirect stdout/stderr to a
# wx window), build the editor frame, and enter the GUI event loop.
app = wx.App(False)
frame = MainWindow(None, "Sample editor")
app.MainLoop()
|
||||||
BIN
potter_sample.txt
Normal file
BIN
potter_sample.txt
Normal file
Binary file not shown.
0
simple.txt
Normal file
0
simple.txt
Normal file
1
test.json
Normal file
1
test.json
Normal file
@@ -0,0 +1 @@
|
|||||||
|
{"Hello": {"_count": 3, ",": {"_count": 2, "this": {"_count": 2}}, "this": {"_count": 1, "is": {"_count": 1}}}, ",": {"_count": 3, "this": {"_count": 3, "is": {"_count": 3}}}, "this": {"_count": 4, "is": {"_count": 4, "Liz": {"_count": 3}, ",": {"_count": 1}}}, "is": {"_count": 5, "Liz": {"_count": 3, "Miller": {"_count": 3}}, "a": {"_count": 1, "small": {"_count": 1}}, ",": {"_count": 1, "this": {"_count": 1}}}, "Liz": {"_count": 3, "Miller": {"_count": 3, ".": {"_count": 2}, "This": {"_count": 1}}}, "Miller": {"_count": 3, ".": {"_count": 2, "This": {"_count": 1}, "Hello": {"_count": 1}}, "This": {"_count": 1, "is": {"_count": 1}}}, ".": {"_count": 3, "This": {"_count": 1, "is": {"_count": 1}}, "Hello": {"_count": 2, "this": {"_count": 1}, ",": {"_count": 1}}}, "This": {"_count": 2, "is": {"_count": 2, "a": {"_count": 2}}}, "a": {"_count": 2, "small": {"_count": 1, "test": {"_count": 1}}, "trigram": {"_count": 1, "markov": {"_count": 1}}}, "small": {"_count": 1, "test": {"_count": 1, "text": {"_count": 1}}}, "test": {"_count": 1, "text": {"_count": 1, "from": {"_count": 1}}}, "text": {"_count": 1, "from": {"_count": 1, "which": {"_count": 1}}}, "from": {"_count": 1, "which": {"_count": 1, "to": {"_count": 1}}}, "which": {"_count": 1, "to": {"_count": 1, "generate": {"_count": 1}}}, "to": {"_count": 1, "generate": {"_count": 1, "a": {"_count": 1}}}, "generate": {"_count": 1, "a": {"_count": 1, "trigram": {"_count": 1}}}, "trigram": {"_count": 1, "markov": {"_count": 1, "language": {"_count": 1}}}, "markov": {"_count": 1, "language": {"_count": 1, "model": {"_count": 1}}}, "language": {"_count": 1, "model": {"_count": 1, ".": {"_count": 1}}}, "model": {"_count": 1, ".": {"_count": 1, "Hello": {"_count": 1}}}}
|
||||||
2
test.txt
Normal file
2
test.txt
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
Hello, this is Liz Miller. This is a small test text from which to generate a trigram markov language model.
|
||||||
|
Hello this is, this is Liz Miller. Hello, this is Liz Miller This is a small
|
||||||
22748
text/1.txt
Normal file
22748
text/1.txt
Normal file
File diff suppressed because it is too large
Load Diff
9106
text/2.txt
Normal file
9106
text/2.txt
Normal file
File diff suppressed because it is too large
Load Diff
4265
text/3.txt
Normal file
4265
text/3.txt
Normal file
File diff suppressed because it is too large
Load Diff
22310
text/4.txt
Normal file
22310
text/4.txt
Normal file
File diff suppressed because it is too large
Load Diff
12724
text/5.txt
Normal file
12724
text/5.txt
Normal file
File diff suppressed because it is too large
Load Diff
1
text/Harry_Potter_all_books_preprocessed.txt
Normal file
1
text/Harry_Potter_all_books_preprocessed.txt
Normal file
File diff suppressed because one or more lines are too long
7317
text/frank.txt
Normal file
7317
text/frank.txt
Normal file
File diff suppressed because it is too large
Load Diff
33214
text/ulysses.txt
Normal file
33214
text/ulysses.txt
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user