"""Build or extend a trigram frequency model from plain-text files.

The model is a nested dictionary ``model[first][second][third]`` in which
every level carries a ``"_count"`` entry recording how many trigrams passed
through that node. It is stored on disk as JSON.
"""
import argparse
import json
import os
from typing import Any, Dict, Iterable, Iterator


def get_token(file: Iterable[str]) -> Iterator[str]:
    """Yield the word and punctuation tokens found in *file*.

    Args:
        file: Any iterable of strings (an open text file, a list of lines).

    Yields:
        Non-empty tokens in reading order. Runs of non-space,
        non-punctuation characters form a word; each punctuation character
        listed in ``punct_tokens`` is a token of its own. Whitespace only
        separates tokens and is never yielded.
    """
    punct_tokens = '",.!?;:-_'
    word = ""
    for line in file:
        for c in line:
            if c.isspace():
                if word:
                    yield word
                    word = ""
            elif c in punct_tokens:
                # Flush the pending word only when there is one; otherwise
                # consecutive punctuation ("...") or punctuation following a
                # space would inject empty-string tokens into the stream.
                if word:
                    yield word
                    word = ""
                yield c
            else:
                word += c
    # Input that does not end in whitespace/punctuation still has a word
    # pending here; emit it instead of silently dropping it.
    if word:
        yield word


def update_model(model: Dict[str, Any], file: Iterable[str]) -> None:
    """Add the trigram counts of *file* to *model* in place.

    Args:
        model: Nested trigram dictionary to update (may be empty).
        file: Iterable of strings to tokenize with :func:`get_token`.

    For every consecutive token triple ``(a, b, c)`` the ``"_count"`` of
    ``model[a]``, ``model[a][b]`` and ``model[a][b][c]`` is incremented,
    creating the nodes as needed. Input with fewer than three tokens leaves
    *model* unchanged.
    """
    token_stream = get_token(file)
    try:
        former = next(token_stream)
        current = next(token_stream)
    except StopIteration:
        # Fewer than two tokens: no trigram can be formed.
        return

    # A plain ``for`` ends cleanly on exhaustion and lets genuine errors
    # (decode failures, Ctrl-C) propagate instead of being swallowed.
    for upcoming in token_stream:
        first_level = model.setdefault(former, {"_count": 0})
        first_level["_count"] += 1

        second_level = first_level.setdefault(current, {"_count": 0})
        second_level["_count"] += 1

        third_level = second_level.setdefault(upcoming, {"_count": 0})
        third_level["_count"] += 1

        # Slide the window forward by one token.
        former, current = current, upcoming


def main(args: argparse.Namespace) -> None:
    """Load an optional existing model, feed it the input text, save it.

    Args:
        args: Parsed command-line arguments with attributes ``input`` (a
            file or a directory of files), ``model`` (path of an existing
            JSON model, or empty for a fresh one) and ``out`` (destination
            path for the resulting JSON model).
    """
    if args.model:
        with open(args.model, encoding="utf-8") as m_file:
            model = json.load(m_file)
    else:
        model = {}

    if os.path.isdir(args.input):
        # Sorted for a reproducible processing order; sub-directories are
        # skipped because they cannot be opened as text files.
        candidates = (
            os.path.join(args.input, entry)
            for entry in sorted(os.listdir(args.input))
        )
        files = [path for path in candidates if os.path.isfile(path)]
    else:
        files = [args.input]

    for f_name in files:
        with open(f_name, "r", encoding="utf-8") as file:
            update_model(model, file)

    with open(args.out, "w", encoding="utf-8") as m_file:
        json.dump(model, m_file)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('input', help='text file or directory of text files')
    parser.add_argument('-m', '--model', default='',
                        help='existing JSON model to extend')
    parser.add_argument('-o', '--out', default='out.json',
                        help='path of the JSON model to write')
    main(parser.parse_args())