#!/usr/bin/env python3
# Originally from https://github.com/HexHive/Gramatron
# License: Apache-2
# Copyright 2021 HexHive
# Copyright 2021 AFLplusplus
import sys
import re
import copy
import json
from collections import defaultdict

DEBUG = False
COUNT = 1
def convert_to_gnf(grammar, start):
if DEBUG:
with open('debug_preprocess.json', 'w+') as fd:
json.dump(grammar, fd)
grammar = remove_unit(grammar) # eliminates unit productions
if DEBUG:
with open('debug_unit.json', 'w+') as fd:
json.dump(grammar, fd)
    grammar = remove_mixed(grammar)  # replace terminals that are mixed in with non-terminals
if DEBUG:
with open('debug_mixed.json', 'w+') as fd:
json.dump(grammar, fd)
grammar = gnf(grammar)
# Dump GNF form of the grammar with only reachable rules
# reachable_grammar = get_reachable(grammar, start)
# with open('debug_gnf_reachable.json', 'w+') as fd:
# json.dump(reachable_grammar, fd)
if DEBUG:
with open('debug_gnf.json', 'w+') as fd:
json.dump(grammar, fd)
grammar["Start"] = [start]
return grammar
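# Illustrative input shape (inferred from the code in this file, not from
# upstream docs): the grammar is a dict mapping each nonterminal to a list of
# space-separated rules, with terminals quoted, e.g.
#   {"program": ["stmt_list"],
#    "stmt_list": ["stmt ';' stmt_list", "stmt ';'"],
#    "stmt": ["'a'", "'b'"]}
# convert_to_gnf() rewrites such a grammar so every rule starts with a
# terminal (Greibach Normal Form).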
def remove_left_recursion(grammar):
    # Remove direct left recursion from the grammar rules.
    # The algorithm is adapted from
    # https://www.geeksforgeeks.org/introduction-of-parsing-ambiguity-and-parsers-set-1/
    # Note that the current implementation is not complete: it does not
    # remove indirect left recursion such as { "A": ["B C"], "B": ["A D"] }.
    # Therefore, this function must be called each time the grammar is updated.
old_grammar = copy.deepcopy(grammar)
new_grammar = defaultdict(list)
no_left_recursion = False
while not no_left_recursion:
for lhs, rules in old_grammar.items():
left_recursion = []
others = []
for rule in rules:
tokens = gettokens(rule)
if tokens[0] == lhs:
left_recursion.append(tokens)
else:
others.append(tokens)
if left_recursion:
new_rule = get_nonterminal()
for r in others:
r.append(new_rule)
                # A -> A a | b becomes A -> b A', A' -> a A' | epsilon,
                # where epsilon is encoded as the single-space terminal "' '"
                left_recursion = [r[1:] + [new_rule] for r in left_recursion]
                left_recursion.append(["' '"])
new_grammar[lhs] = [' '.join(rule) for rule in others]
new_grammar[new_rule] = [' '.join(rule) for rule in left_recursion]
else:
new_grammar[lhs] = [' '.join(rule) for rule in others]
no_left_recursion = True
        # Re-check the rebuilt grammar for any remaining direct left recursion
        for lhs, rules in new_grammar.items():
            for rule in rules:
                tokens = gettokens(rule)
                if tokens[0] == lhs:
                    no_left_recursion = False
                    break
            else:
                continue
            break
if not no_left_recursion:
old_grammar = copy.deepcopy(new_grammar)
new_grammar = defaultdict(list)
return new_grammar
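# Worked example for remove_left_recursion (illustrative; the generated
# nonterminal name depends on the global COUNT):
#   {"A": ["A 'a'", "'b'"]}
# becomes
#   {"A": ["'b' GeneratedTermVar2"],
#    "GeneratedTermVar2": ["'a' GeneratedTermVar2", "' '"]}
# i.e. A -> A a | b is rewritten as A -> b A', A' -> a A' | epsilon.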
def get_reachable(grammar, start):
'''
Returns a grammar without dead rules
'''
reachable_nt = set()
worklist = list()
processed = set()
reachable_grammar = dict()
worklist.append(start)
while worklist:
nt = worklist.pop(0)
processed.add(nt)
reachable_grammar[nt] = grammar[nt]
rules = grammar[nt]
for rule in rules:
tokens = gettokens(rule)
for token in tokens:
if not isTerminal(token):
if token not in processed:
worklist.append(token)
return reachable_grammar
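# Example for get_reachable (illustrative): with start = "A",
#   {"A": ["B 'x'"], "B": ["'y'"], "C": ["'z'"]}
# yields {"A": ["B 'x'"], "B": ["'y'"]} -- the unreachable rule "C" is dropped.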
def gettokens(rule):
    pattern = re.compile(r"([^\s\"\']+)|\"([^\"]*)\"|\'([^\']*)\'")
return [matched.group(0) for matched in pattern.finditer(rule)]
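# Example for gettokens (illustrative): gettokens("stmt ';' expr")
# returns ["stmt", "';'", "expr"] -- quoted terminals stay quoted, so the
# later passes can tell them apart from nonterminals.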
def gnf(grammar):
old_grammar = copy.deepcopy(grammar)
new_grammar = defaultdict(list)
isgnf = False
while not isgnf:
old_grammar = remove_left_recursion(old_grammar)
for lhs, rules in old_grammar.items():
for rule in rules:
tokens = gettokens(rule)
if len(tokens) == 1 and isTerminal(rule):
new_grammar[lhs].append(rule)
continue
startoken = tokens[0]
                assert startoken != lhs, "left recursion should already be removed"
endrule = tokens[1:]
if not isTerminal(startoken):
newrules = []
extendrules = old_grammar[startoken]
for extension in extendrules:
temprule = endrule[:]
temprule.insert(0, extension)
newrules.append(temprule)
for newnew in newrules:
new_grammar[lhs].append(' '.join(newnew))
else:
new_grammar[lhs].append(rule)
isgnf = True
for lhs, rules in new_grammar.items():
for rule in rules:
# if "\' \'" or isTerminal(rule):
tokens = gettokens(rule)
if len(tokens) == 1 and isTerminal(rule):
continue
startoken = tokens[0]
if not isTerminal(startoken):
isgnf = False
break
if not isgnf:
old_grammar = copy.deepcopy(new_grammar)
new_grammar = defaultdict(list)
return new_grammar
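# Worked example for the substitution step in gnf() (illustrative):
# with {"B": ["C 'x'"], "C": ["'c'"]}, the rule "C 'x'" starts with a
# nonterminal, so every C-rule is prepended to the remainder, giving
#   {"B": ["'c' 'x'"], "C": ["'c'"]}
# -- now every rule begins with a terminal, as GNF requires.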
def process_antlr4_grammar(data):
productions = []
production = []
    for line in data:
        if line != '\n':
            production.append(line)
        else:
            productions.append(production)
            production = []
    if production:
        # Keep a trailing production even if the file does not end in a blank line
        productions.append(production)
final_rule_set = {}
for production in productions:
rules = []
init = production[0]
nonterminal = init.split(':')[0]
rules.append(strip_chars(init.split(':')[1]).strip('| '))
for production_rule in production[1:]:
rules.append(strip_chars(production_rule.split('|')[0]))
final_rule_set[nonterminal] = rules
return final_rule_set
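# Expected .g4 layout (inferred from the string handling above -- this is a
# simplified, line-based subset of ANTLR4, not a full parser): one production
# per block, blocks separated by blank lines, alternatives joined by a
# trailing '|', e.g.
#   stmt: expr ';' |
#   'return' expr ';'
#
#   expr: NUMBER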
def remove_unit(grammar):
nounitproductions = False
old_grammar = copy.deepcopy(grammar)
new_grammar = defaultdict(list)
while not nounitproductions:
for lhs, rules in old_grammar.items():
for rhs in rules:
# Checking if the rule is a unit production rule
if len(gettokens(rhs)) == 1:
if not isTerminal(rhs):
new_grammar[lhs].extend([rule for rule in old_grammar[rhs]])
else:
new_grammar[lhs].append(rhs)
else:
new_grammar[lhs].append(rhs)
# Checking there are no unit productions left in the grammar
nounitproductions = True
for lhs, rules in new_grammar.items():
for rhs in rules:
if len(gettokens(rhs)) == 1:
if not isTerminal(rhs):
nounitproductions = False
break
if not nounitproductions:
break
# Unit productions are still there in the grammar -- repeat the process
if not nounitproductions:
old_grammar = copy.deepcopy(new_grammar)
new_grammar = defaultdict(list)
return new_grammar
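# Example for remove_unit (illustrative): {"A": ["B", "'x'"], "B": ["'y'"]}
# becomes {"A": ["'y'", "'x'"], "B": ["'y'"]} -- the unit production A -> B
# is replaced by the right-hand sides of B.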
def isTerminal(rule):
    # A token is treated as a terminal iff it starts with a quoted literal
    pattern = re.compile(r"\'(.*?)\'")
    return bool(pattern.match(rule))
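# Examples (illustrative): isTerminal("'a'") -> True, isTerminal("stmt") -> False.
# Only the start of the token is checked, so a mixed token such as "'a'b"
# also matches.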
def remove_mixed(grammar):
'''
Remove rules where there are terminals mixed in with non-terminals
'''
new_grammar = defaultdict(list)
for lhs, rules in grammar.items():
for rhs in rules:
tokens = gettokens(rhs)
if len(tokens) == 1:
new_grammar[lhs].append(rhs)
continue
regen_rule = [tokens[0]]
for token in tokens[1:]:
                # Identify if there is a terminal in the RHS
if isTerminal(token):
# Check if a corresponding nonterminal already exists
nonterminal = terminal_exist(token, new_grammar)
if nonterminal:
regen_rule.append(nonterminal)
else:
new_nonterm = get_nonterminal()
new_grammar[new_nonterm].append(token)
regen_rule.append(new_nonterm)
else:
regen_rule.append(token)
new_grammar[lhs].append(' '.join(regen_rule))
return new_grammar
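# Worked example for remove_mixed (illustrative; the generated name depends
# on the global COUNT): {"A": ["B 'x' C"]} becomes
#   {"A": ["B GeneratedTermVar2 C"], "GeneratedTermVar2": ["'x'"]}
# The leading token is deliberately left alone: a rule that already starts
# with a terminal is exactly what GNF requires.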
def strip_chars(rule):
return rule.strip('\n\t ')
def get_nonterminal():
global COUNT
COUNT += 1
return f"GeneratedTermVar{COUNT}"
def terminal_exist(token, grammar):
    # Reuse an already-generated nonterminal whose sole rule is this terminal
    for nonterminal, rules in grammar.items():
        if token in rules and len(rules) == 1:
            return nonterminal
    return None
def main(grammar_file, out, start):
grammar = None
    # If the grammar file is already preprocessed JSON, skip preprocessing
    if grammar_file.endswith('.json'):
        with open(grammar_file, 'r') as fd:
            grammar = json.load(fd)
    elif grammar_file.endswith('.g4'):
with open(grammar_file, 'r') as fd:
data = fd.readlines()
grammar = process_antlr4_grammar(data)
else:
        raise ValueError('Unknown file format passed. Accepts (.g4/.json)')
grammar = convert_to_gnf(grammar, start)
with open(out, 'w+') as fd:
json.dump(grammar, fd)
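# Example invocation (hypothetical file names):
#   ./gnf_converter.py --gf grammar.g4 --out grammar_gnf.json --start program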
if __name__ == '__main__':
import argparse
    parser = argparse.ArgumentParser(description='Script to convert grammar to GNF form')
    parser.add_argument(
        '--gf',
        type=str,
        required=True,
        help='Location of grammar file')
    parser.add_argument(
        '--out',
        type=str,
        required=True,
        help='Location of output file')
    parser.add_argument(
        '--start',
        type=str,
        required=True,
        help='Start token')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Write intermediate states to debug files')
args = parser.parse_args()
DEBUG = args.debug
main(args.gf, args.out, args.start)