initial
This commit is contained in:
65
Generate_Model.py
Normal file
65
Generate_Model.py
Normal file
@@ -0,0 +1,65 @@
|
||||
import json
|
||||
import argparse
|
||||
import os
|
||||
|
||||
def get_token(file):
    """Yield word and punctuation tokens from an iterable of text lines.

    Whitespace separates words and is never emitted. Every character found
    in ``punct_tokens`` ends the word in progress and is emitted as its own
    one-character token. Only non-empty tokens are produced, and a word that
    is still being built when the input ends is emitted as the final token.

    A word is ended only by whitespace or punctuation, so it can continue
    across consecutive items of ``file`` when an item does not end with a
    newline (or other whitespace).

    Args:
        file: Iterable of strings, e.g. a text file opened for reading.

    Yields:
        str: The tokens, in the order they appear in the input.
    """
    punct_tokens = '",.!?;:-_'
    word = ""
    for line in file:
        for c in line:
            is_punct = c in punct_tokens
            if is_punct or c.isspace():
                # Both whitespace and punctuation close the current word;
                # skip the yield when no word is pending so that runs of
                # separators never produce empty tokens.
                if word:
                    yield word
                    word = ""
                if is_punct:
                    yield c
            else:
                word += c
    # Flush the last word when the input does not end with a separator.
    if word:
        yield word
|
||||
|
||||
def update_model(model, file):
    """Add the token trigram counts of ``file`` to ``model`` in place.

    ``model`` is a three-level nested dict keyed by consecutive tokens:
    ``model[a][b][c]["_count"]`` is the number of times the sequence
    ``a, b, c`` was seen. The intermediate levels carry their own
    ``"_count"`` entry, incremented once for every trigram that passes
    through them.

    Input that produces fewer than three tokens contains no trigram and
    leaves ``model`` unchanged. Errors raised while reading ``file``
    propagate to the caller instead of silently ending the update.

    Args:
        model: Dict to update; may be empty or hold counts from earlier calls.
        file: Iterable of text lines, passed to ``get_token``.
    """
    token_stream = get_token(file)
    try:
        former = next(token_stream)
        current = next(token_stream)
    except StopIteration:
        # Fewer than two tokens: nothing to count.
        return

    # Slide a three-token window (former, current, prior) over the stream.
    for prior in token_stream:
        first = model.setdefault(former, {"_count": 0})
        first["_count"] += 1
        second = first.setdefault(current, {"_count": 0})
        second["_count"] += 1
        third = second.setdefault(prior, {"_count": 0})
        third["_count"] += 1
        former, current = current, prior
|
||||
|
||||
def main(args):
    """Build or extend a trigram model from text input and save it as JSON.

    Args:
        args: Parsed command-line namespace with the attributes
            ``model`` (path of an existing JSON model to extend, or an empty
            string to start from scratch), ``input`` (a text file, or a
            directory whose regular files are all read) and ``out`` (path
            the resulting JSON model is written to).
    """
    if args.model:
        with open(args.model, encoding="utf-8") as m_file:
            model = json.load(m_file)
    else:
        model = {}

    if os.path.isdir(args.input):
        # Sort for a reproducible processing order (os.listdir order is
        # arbitrary) and skip subdirectories, which open() cannot read.
        candidates = (
            os.path.join(args.input, entry)
            for entry in sorted(os.listdir(args.input))
        )
        files = [path for path in candidates if os.path.isfile(path)]
    else:
        files = [args.input]

    for f_name in files:
        with open(f_name, "r", encoding="utf-8") as file:
            update_model(model, file)

    with open(args.out, "w", encoding="utf-8") as m_file:
        json.dump(model, m_file)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: one positional input path plus optional
    # paths for an existing model to extend and for the output file.
    cli = argparse.ArgumentParser()
    cli.add_argument('input')
    cli.add_argument('-m', '--model', default='')
    cli.add_argument('-o', '--out', default='out.json')
    cli_args = cli.parse_args()
    main(cli_args)
|
||||
Reference in New Issue
Block a user