# markov/Generate_Model.py

import json
import argparse
import os

def get_token(file):
    """Yield word and punctuation tokens from an iterable of text lines.

    Whitespace separates words and is never emitted. Every character in
    ``punct_tokens`` ends the word before it and is emitted as its own
    one-character token. Empty tokens are never produced, and a word still
    being accumulated when the input ends is emitted as the final token.

    A word is only terminated by whitespace or punctuation, not by the
    boundary between two items of ``file``; lines read from a real file end
    in a newline, which acts as the separator.

    Args:
        file: Any iterable of strings, typically an open text file.

    Yields:
        str: The non-empty tokens, in input order.
    """
    punct_tokens = '",.!?;:-_'
    word = ""
    for line in file:
        for c in line:
            is_space = c.isspace()
            if is_space or c in punct_tokens:
                # Both separators flush the pending word, but only when there
                # is one -- runs such as "..." or " - " must not emit "".
                if word:
                    yield word
                    word = ""
                if not is_space:
                    yield c
            else:
                word += c
    # Input without a trailing newline would otherwise lose its last word.
    if word:
        yield word

def update_model(model, file):
    """Add the trigram counts of one text source to ``model`` in place.

    The model is a three-level nested dict keyed by consecutive tokens:
    ``model[t1]``, ``model[t1][t2]`` and ``model[t1][t2][t3]`` are dicts
    that each carry a ``"_count"`` entry, incremented once for every
    occurrence of the trigram ``(t1, t2, t3)`` in the token stream.

    Args:
        model: Dict to update; may already hold counts from earlier input.
        file: Iterable of text lines, passed to ``get_token``.

    Returns:
        None. Input with fewer than three tokens leaves ``model`` unchanged.
    """
    token_stream = get_token(file)
    try:
        former = next(token_stream)
        current = next(token_stream)
    except StopIteration:
        # Fewer than two tokens: no trigram can be formed, nothing to count.
        return
    # A plain for-loop ends on exhaustion only; any real error raised while
    # reading (e.g. a decode error) propagates instead of being swallowed.
    for prior in token_stream:
        node = model
        # Walk/create the path former -> current -> prior, counting each level.
        for token in (former, current, prior):
            node = node.setdefault(token, {"_count": 0})
            node["_count"] += 1
        former, current = current, prior

def main(args):
    """Build or extend a trigram model from text input and write it as JSON.

    Args:
        args: Parsed command-line namespace with the attributes
            ``input`` (path of a text file, or of a directory whose files
            are all processed), ``model`` (path of an existing JSON model to
            extend; falsy to start from an empty model) and ``out`` (path the
            resulting JSON model is written to).
    """
    if args.model:
        with open(args.model, 'r', encoding='utf-8') as m_file:
            model = json.load(m_file)
    else:
        model = {}
    if os.path.isdir(args.input):
        # Sorted because os.listdir order is arbitrary; a fixed order keeps
        # the key order of the written model reproducible.
        candidates = (
            os.path.join(args.input, name)
            for name in sorted(os.listdir(args.input))
        )
        # Skip subdirectories and other non-regular entries: open() on them
        # would raise and abort the whole run.
        files = [path for path in candidates if os.path.isfile(path)]
    else:
        files = [args.input]
    for f_name in files:
        with open(f_name, 'r', encoding='utf-8') as file:
            update_model(model, file)
    with open(args.out, "w", encoding='utf-8') as m_file:
        json.dump(model, m_file)


if __name__ == "__main__":
    # Script entry point: one positional input path plus optional paths for
    # an existing model to extend (-m) and for the output file (-o).
    cli = argparse.ArgumentParser()
    cli.add_argument('input')
    cli.add_argument('-m', '--model', default='')
    cli.add_argument('-o', '--out', default='out.json')
    options = cli.parse_args()
    main(options)