Files
markov/Generate_Model.py
2026-02-24 21:15:27 -05:00

65 lines
1.9 KiB
Python

import json
import argparse
import os
def get_token(file):
    """Lazily split an iterable of text lines into word and punctuation tokens.

    Whitespace separates tokens and is never emitted. Each character in
    ``punct_tokens`` ends the word in progress and is emitted as its own
    single-character token. Every other run of characters forms a word.

    Args:
        file: Any iterable yielding strings (an open text file, a list of
            lines, ...).

    Yields:
        Non-empty word strings and single punctuation characters, in the
        order they appear in the input.
    """
    punct_tokens = '",.!?;:-_'
    chars = []  # characters of the word currently being assembled
    for line in file:
        for c in line:
            is_space = c.isspace()
            if is_space or c in punct_tokens:
                # A separator ends the current word. Only emit the word when
                # there is one, so "word ." or "?!" never produce "" tokens.
                if chars:
                    yield "".join(chars)
                    chars.clear()
                if not is_space:
                    yield c
            else:
                chars.append(c)
    # Flush a trailing word when the input does not end with a separator
    # (e.g. a file without a final newline); otherwise it would be lost.
    if chars:
        yield "".join(chars)
def update_model(model, file):
    """Add the trigram counts of one text source to ``model`` in place.

    The model is a three-level nested dict keyed by consecutive tokens:
    ``model[t1][t2][t3]``. Every level also carries a ``"_count"`` entry
    holding how many times that prefix was observed. For each window of
    three consecutive tokens, the count at each of the three levels is
    incremented by one.

    Args:
        model: Dict to update; may be empty or hold counts from earlier calls.
        file: Iterable of text lines, tokenized with ``get_token``.

    Returns:
        None. ``model`` is mutated.
    """
    token_stream = get_token(file)

    # Prime the sliding window. An input with fewer than two tokens cannot
    # form a trigram, so leave the model untouched rather than letting
    # StopIteration escape to the caller.
    try:
        former = next(token_stream)
        current = next(token_stream)
    except StopIteration:
        return

    # Iterating the generator directly ends cleanly on exhaustion and lets
    # real errors (decode failures, KeyboardInterrupt) propagate instead of
    # being swallowed and silently truncating the input.
    for following in token_stream:
        # NOTE: "_count" shares the dict with token keys; get_token emits "_"
        # as its own token, so a text token is not expected to collide.
        level_one = model.setdefault(former, {"_count": 0})
        level_one["_count"] += 1

        level_two = level_one.setdefault(current, {"_count": 0})
        level_two["_count"] += 1

        level_three = level_two.setdefault(following, {"_count": 0})
        level_three["_count"] += 1

        # Slide the window forward by one token.
        former, current = current, following
def main(args):
    """Build or extend a trigram model from text input and write it as JSON.

    Args:
        args: Parsed command-line namespace with attributes:
            ``input`` - path to a text file, or to a directory whose
                regular files are all processed;
            ``model`` - path to an existing JSON model to extend, or an
                empty string to start from an empty model;
            ``out`` - path the resulting JSON model is written to.
    """
    if args.model:
        with open(args.model, encoding='utf-8') as m_file:
            model = json.load(m_file)
    else:
        model = {}

    if os.path.isdir(args.input):
        # Sort so the processing order (and therefore the key order of the
        # written JSON) is reproducible; os.listdir order is arbitrary.
        candidates = (
            os.path.join(args.input, name)
            for name in sorted(os.listdir(args.input))
        )
        # Skip subdirectories and other non-regular entries; open() would
        # fail on them and abort the whole run.
        files = [path for path in candidates if os.path.isfile(path)]
    else:
        files = [args.input]

    for f_name in files:
        with open(f_name, 'r', encoding='utf-8') as file:
            update_model(model, file)

    with open(args.out, "w", encoding='utf-8') as m_file:
        json.dump(model, m_file)
if __name__ == "__main__":
    # Command-line entry point: one positional input path plus optional
    # paths for an existing model to extend and for the output file.
    cli = argparse.ArgumentParser()
    argument_specs = (
        (('input',), {}),
        (('-m', '--model'), {'default': ''}),
        (('-o', '--out'), {'default': 'out.json'}),
    )
    for flags, options in argument_specs:
        cli.add_argument(*flags, **options)
    parsed_args = cli.parse_args()
    main(parsed_args)