#!/usr/bin/env python3
# Originally from https://github.com/HexHive/Gramatron
# License: Apache-2
# Copyright 2021 HexHive
# Copyright 2021 AFLplusplus

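"""Convert a context-free grammar to Greibach Normal Form (GNF).

Accepts an ANTLR4-style grammar (.g4) or a preprocessed JSON grammar,
removes unit productions, splits rules that mix terminals with
non-terminals, eliminates direct left recursion, and expands leading
non-terminals until every production starts with a terminal. The result
is written out as JSON for Gramatron-style grammar-based fuzzing.
"""
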
import re
import copy
import json
from collections import defaultdict

DEBUG = False
COUNT = 1  # suffix counter for generated non-terminal names


def convert_to_gnf(grammar, start):
    if DEBUG:
        with open('debug_preprocess.json', 'w+') as fd:
            json.dump(grammar, fd)
    grammar = remove_unit(grammar)  # eliminate unit productions
    if DEBUG:
        with open('debug_unit.json', 'w+') as fd:
            json.dump(grammar, fd)
    grammar = remove_mixed(grammar)  # eliminate terminals mixed in with non-terminals
    if DEBUG:
        with open('debug_mixed.json', 'w+') as fd:
            json.dump(grammar, fd)
    grammar = gnf(grammar)

    # Dump GNF form of the grammar with only reachable rules
    # reachable_grammar = get_reachable(grammar, start)
    # with open('debug_gnf_reachable.json', 'w+') as fd:
    #     json.dump(reachable_grammar, fd)
    if DEBUG:
        with open('debug_gnf.json', 'w+') as fd:
            json.dump(grammar, fd)

    grammar["Start"] = [start]
    return grammar

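# Illustrative end-to-end example (not from the original source): with
# start="A", the grammar {"A": ["B 'x'"], "B": ["'b'"]} passes remove_unit()
# unchanged, remove_mixed() lifts the trailing 'x' into a generated rule,
# and gnf() expands the leading B, so every rule starts with a terminal:
#   {"A": ["'b' GeneratedTermVar2"], "B": ["'b'"],
#    "GeneratedTermVar2": ["'x'"], "Start": ["A"]}
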
def remove_left_recursion(grammar):
    # Remove the left recursion in the grammar rules.
    # This algorithm is adopted from
    # https://www.geeksforgeeks.org/introduction-of-parsing-ambiguity-and-parsers-set-1/
    # Note that the current implementation does not guarantee completeness
    # and will not remove indirect recursions like
    # { "A": ["B C"], "B": ["A D"] }. Therefore, we need to call this
    # function each time the rules are updated.
    old_grammar = copy.deepcopy(grammar)
    new_grammar = defaultdict(list)
    no_left_recursion = False
    while not no_left_recursion:
        for lhs, rules in old_grammar.items():
            left_recursion = []
            others = []
            for rule in rules:
                tokens = gettokens(rule)
                if tokens[0] == lhs:
                    left_recursion.append(tokens)
                else:
                    others.append(tokens)
            if left_recursion:
                # Rewrite A -> A a | b  as  A -> b R  and  R -> a R | ' '
                new_rule = get_nonterminal()
                for r in others:
                    r.append(new_rule)
                left_recursion = [r[1:] + [new_rule] for r in left_recursion]
                left_recursion.append(["' '"])
                new_grammar[lhs] = [' '.join(rule) for rule in others]
                new_grammar[new_rule] = [' '.join(rule) for rule in left_recursion]
            else:
                new_grammar[lhs] = [' '.join(rule) for rule in others]
        # Check whether the rewritten grammar is still left-recursive
        no_left_recursion = True
        for lhs, rules in new_grammar.items():
            for rule in rules:
                tokens = gettokens(rule)
                if tokens[0] == lhs:
                    no_left_recursion = False
                    break
            else:
                continue
            break
        if not no_left_recursion:
            old_grammar = copy.deepcopy(new_grammar)
            new_grammar = defaultdict(list)
    return new_grammar

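# Illustrative example (not from the original source): for
# {"A": ["A 'a'", "'b'"]}, one pass produces (generated names depend on
# the global COUNT):
#   A                 -> 'b' GeneratedTermVar2
#   GeneratedTermVar2 -> 'a' GeneratedTermVar2 | ' '
# where the single-space terminal ' ' stands in for an empty expansion.
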
def get_reachable(grammar, start):
    '''
    Returns a grammar without dead (unreachable) rules
    '''
    processed = set()
    reachable_grammar = dict()
    worklist = [start]

    while worklist:
        nt = worklist.pop(0)
        processed.add(nt)
        reachable_grammar[nt] = grammar[nt]
        for rule in grammar[nt]:
            for token in gettokens(rule):
                # Queue every not-yet-visited non-terminal on the RHS
                if not isTerminal(token) and token not in processed:
                    worklist.append(token)
    return reachable_grammar

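# e.g. get_reachable({"A": ["'a'"], "B": ["'b'"]}, "A") == {"A": ["'a'"]}
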
def gettokens(rule):
    pattern = re.compile(r"([^\s\"\']+)|\"([^\"]*)\"|\'([^\']*)\'")
    return [matched.group(0) for matched in pattern.finditer(rule)]

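# Example: gettokens("stmt 'if' expr") returns ["stmt", "'if'", "expr"];
# quoted terminals keep their quotes so isTerminal() can recognize them.
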
def gnf(grammar):
    old_grammar = copy.deepcopy(grammar)
    new_grammar = defaultdict(list)
    isgnf = False
    while not isgnf:
        old_grammar = remove_left_recursion(old_grammar)
        for lhs, rules in old_grammar.items():
            for rule in rules:
                tokens = gettokens(rule)
                if len(tokens) == 1 and isTerminal(rule):
                    new_grammar[lhs].append(rule)
                    continue
                startoken = tokens[0]
                assert startoken != lhs
                endrule = tokens[1:]
                if not isTerminal(startoken):
                    # Substitute the leading non-terminal with each of its
                    # expansions
                    newrules = []
                    extendrules = old_grammar[startoken]
                    for extension in extendrules:
                        temprule = endrule[:]
                        temprule.insert(0, extension)
                        newrules.append(temprule)
                    for newnew in newrules:
                        new_grammar[lhs].append(' '.join(newnew))
                else:
                    new_grammar[lhs].append(rule)
        # Check whether every rule now starts with a terminal
        isgnf = True
        for lhs, rules in new_grammar.items():
            for rule in rules:
                tokens = gettokens(rule)
                if len(tokens) == 1 and isTerminal(rule):
                    continue
                startoken = tokens[0]
                if not isTerminal(startoken):
                    isgnf = False
                    break
            else:
                continue
            break
        if not isgnf:
            old_grammar = copy.deepcopy(new_grammar)
            new_grammar = defaultdict(list)
    return new_grammar

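# Illustrative substitution step (not from the original source): with
# {"A": ["B 'x'"], "B": ["'b'"]}, the leading B in A's rule is replaced by
# B's expansion, giving {"A": ["'b' 'x'"], "B": ["'b'"]}; every rule then
# starts with a terminal, so the loop terminates.
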
def process_antlr4_grammar(data):
    productions = []
    production = []
    # Productions are expected to be separated by blank lines; the first
    # line of a production holds "Name: rule" and the following lines hold
    # the remaining alternatives
    for line in data:
        if line != '\n':
            production.append(line)
        else:
            productions.append(production)
            production = []
    if production:
        # Keep the last production even without a trailing blank line
        productions.append(production)
    final_rule_set = {}
    for production in productions:
        rules = []
        init = production[0]
        nonterminal = init.split(':')[0]
        rules.append(strip_chars(init.split(':')[1]).strip('| '))
        for production_rule in production[1:]:
            rules.append(strip_chars(production_rule.split('|')[0]))
        final_rule_set[nonterminal] = rules
    return final_rule_set

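# Expected input sketch (an assumption inferred from the parsing above,
# with '|' closing each alternative rather than opening the next line):
#
#   expr: term '+' expr |
#   term
#
#   term: 'x'
#
# which yields {"expr": ["term '+' expr", "term"], "term": ["'x'"]}.
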
def remove_unit(grammar):
    nounitproductions = False
    old_grammar = copy.deepcopy(grammar)
    new_grammar = defaultdict(list)
    while not nounitproductions:
        for lhs, rules in old_grammar.items():
            for rhs in rules:
                # A unit production is a single non-terminal on the RHS;
                # replace it with the rules of that non-terminal
                if len(gettokens(rhs)) == 1 and not isTerminal(rhs):
                    new_grammar[lhs].extend(old_grammar[rhs])
                else:
                    new_grammar[lhs].append(rhs)
        # Check that no unit productions are left in the grammar
        nounitproductions = True
        for lhs, rules in new_grammar.items():
            for rhs in rules:
                if len(gettokens(rhs)) == 1 and not isTerminal(rhs):
                    nounitproductions = False
                    break
            if not nounitproductions:
                break
        # Unit productions are still in the grammar -- repeat the process
        if not nounitproductions:
            old_grammar = copy.deepcopy(new_grammar)
            new_grammar = defaultdict(list)
    return new_grammar

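# Illustrative example: {"A": ["B"], "B": ["'b' C", "'c'"]} becomes
# {"A": ["'b' C", "'c'"], "B": ["'b' C", "'c'"]} -- the unit rule A -> B
# is replaced by B's right-hand sides.
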
def isTerminal(rule):
    # A token is a terminal iff it is wrapped in single quotes
    pattern = re.compile(r"\'(.*?)\'")
    return bool(pattern.match(rule))

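# e.g. isTerminal("'foo'") is True, isTerminal("stmt") is False
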
def remove_mixed(grammar):
    '''
    Remove rules where there are terminals mixed in with non-terminals
    '''
    new_grammar = defaultdict(list)
    for lhs, rules in grammar.items():
        for rhs in rules:
            tokens = gettokens(rhs)
            if len(tokens) == 1:
                new_grammar[lhs].append(rhs)
                continue
            # The leading token may stay a terminal (GNF allows it); every
            # later terminal is lifted into its own non-terminal
            regen_rule = [tokens[0]]
            for token in tokens[1:]:
                # Identify if there is a terminal in the RHS
                if isTerminal(token):
                    # Check if a corresponding non-terminal already exists
                    nonterminal = terminal_exist(token, new_grammar)
                    if nonterminal:
                        regen_rule.append(nonterminal)
                    else:
                        new_nonterm = get_nonterminal()
                        new_grammar[new_nonterm].append(token)
                        regen_rule.append(new_nonterm)
                else:
                    regen_rule.append(token)
            new_grammar[lhs].append(' '.join(regen_rule))
    return new_grammar

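# Illustrative example: {"A": ["'a' B 'c'"]} becomes
# {"A": ["'a' B GeneratedTermVar2"], "GeneratedTermVar2": ["'c'"]} -- the
# leading 'a' stays in place, the trailing 'c' is lifted out (generated
# names depend on the global COUNT).
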
def strip_chars(rule):
    return rule.strip('\n\t ')

def get_nonterminal():
    # Mint a fresh non-terminal name
    global COUNT
    COUNT += 1
    return f"GeneratedTermVar{COUNT}"

def terminal_exist(token, grammar):
    # Reuse a generated non-terminal only if it expands to exactly this
    # one terminal
    for nonterminal, rules in grammar.items():
        if token in rules and len(rules) == 1:
            return nonterminal
    return None

def main(grammar_file, out, start):
    grammar = None
    # If the grammar file is a preprocessed JSON file, skip preprocessing
    if grammar_file.endswith('.json'):
        with open(grammar_file, 'r') as fd:
            grammar = json.load(fd)
    elif grammar_file.endswith('.g4'):
        with open(grammar_file, 'r') as fd:
            data = fd.readlines()
            grammar = process_antlr4_grammar(data)
    else:
        raise ValueError('Unknown file format passed. Accepts (.g4/.json)')

    grammar = convert_to_gnf(grammar, start)
    with open(out, 'w+') as fd:
        json.dump(grammar, fd)

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Script to convert grammar to GNF form')
    parser.add_argument(
        '--gf',
        type=str,
        required=True,
        help='Location of grammar file')
    parser.add_argument(
        '--out',
        type=str,
        required=True,
        help='Location of output file')
    parser.add_argument(
        '--start',
        type=str,
        required=True,
        help='Start token')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Write intermediate states to debug files')
    args = parser.parse_args()
    DEBUG = args.debug

    main(args.gf, args.out, args.start)
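
# Example invocation (illustrative; grammar file and start symbol depend
# on your target):
#   ./gnf_converter.py --gf PHP.g4 --out php_gnf.json --start program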