#!/usr/bin/env python3
# Originally from https://github.com/HexHive/Gramatron
# License: Apache-2
# Copyright 2021 HexHive
# Copyright 2021 AFLplusplus
import sys
import re
import copy
import json
from string import ascii_uppercase
from itertools import combinations
from collections import defaultdict
DEBUG = False
NONTERMINALSET = []
COUNT = 1


def convert_to_gnf(grammar, start):
    if DEBUG:
        with open("debug_preprocess.json", "w+") as fd:
            json.dump(grammar, fd)
    grammar = remove_unit(grammar)  # eliminates unit productions
    if DEBUG:
        with open("debug_unit.json", "w+") as fd:
            json.dump(grammar, fd)
    grammar = remove_mixed(grammar)  # eliminates terminals mixed in with non-terminals
    if DEBUG:
        with open("debug_mixed.json", "w+") as fd:
            json.dump(grammar, fd)
    grammar = gnf(grammar)
    # Dump GNF form of the grammar with only reachable rules
    # reachable_grammar = get_reachable(grammar, start)
    # with open('debug_gnf_reachable.json', 'w+') as fd:
    #     json.dump(reachable_grammar, fd)
    if DEBUG:
        with open("debug_gnf.json", "w+") as fd:
            json.dump(grammar, fd)
    grammar["Start"] = [start]
    return grammar
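

# Illustrative sketch (hypothetical grammar, names not from this repo):
# convert_to_gnf({"PROGRAM": ["STMT"], "STMT": ["'a' STMT", "'b'"]}, "PROGRAM")
# first inlines the unit production PROGRAM -> STMT, then leaves every rule
# in Greibach Normal Form (a leading terminal followed by non-terminals),
# and finally records the start symbol under the "Start" key.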


def remove_left_recursion(grammar):
    # Remove left recursion in the grammar rules.
    # This algorithm is adapted from
    # https://www.geeksforgeeks.org/introduction-of-parsing-ambiguity-and-parsers-set-1/
    # Note that the current implementation does not guarantee completeness:
    # it will not remove indirect recursion such as { "A": ["B C"], "B": ["A D"] }.
    # Therefore, this function must be called each time the rules are updated.
    old_grammar = copy.deepcopy(grammar)
    new_grammar = defaultdict(list)
    no_left_recursion = False
    while not no_left_recursion:
        for lhs, rules in old_grammar.items():
            left_recursion = []
            others = []
            for rule in rules:
                tokens = gettokens(rule)
                if tokens[0] == lhs:
                    left_recursion.append(tokens)
                else:
                    others.append(tokens)
            if left_recursion:
                new_rule = get_nonterminal()
                for r in others:
                    r.append(new_rule)
                left_recursion = [r[1:] + [new_rule] for r in left_recursion]
                left_recursion.append(["' '"])
                new_grammar[lhs] = [" ".join(rule) for rule in others]
                new_grammar[new_rule] = [" ".join(rule) for rule in left_recursion]
            else:
                new_grammar[lhs] = [" ".join(rule) for rule in others]
        no_left_recursion = True
        for lhs, rules in old_grammar.items():
            for rule in rules:
                tokens = gettokens(rule)
                if tokens[0] == lhs:
                    no_left_recursion = False
                    break
            else:
                continue
            break
        if not no_left_recursion:
            old_grammar = copy.deepcopy(new_grammar)
            new_grammar = defaultdict(list)
    return new_grammar
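

# Illustrative sketch (hypothetical rules; the generated name depends on the
# global COUNT): remove_left_recursion({"A": ["A 'a'", "'b'"]}) rewrites the
# directly left-recursive rule into something like
#   {"A": ["'b' GeneratedTermVar2"],
#    "GeneratedTermVar2": ["'a' GeneratedTermVar2", "' '"]}
# where the "' '" rule acts as the epsilon-like terminator.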


def get_reachable(grammar, start):
    """
    Returns a grammar without dead rules
    """
    reachable_nt = set()
    worklist = list()
    processed = set()
    reachable_grammar = dict()
    worklist.append(start)
    while worklist:
        nt = worklist.pop(0)
        processed.add(nt)
        reachable_grammar[nt] = grammar[nt]
        rules = grammar[nt]
        for rule in rules:
            tokens = gettokens(rule)
            for token in tokens:
                if not isTerminal(token):
                    if token not in processed:
                        worklist.append(token)
    return reachable_grammar
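

# Illustrative sketch (hypothetical grammar): with start symbol "S",
# get_reachable({"S": ["A"], "A": ["'a'"], "Z": ["'z'"]}, "S") drops the
# unreachable rule "Z" and returns {"S": ["A"], "A": ["'a'"]}.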


def gettokens(rule):
    pattern = re.compile(r"([^\s\"']+)|\"([^\"]*)\"|'([^']*)'")
    return [matched.group(0) for matched in pattern.finditer(rule)]
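

# Example: gettokens("stmt 'if' cond") returns ["stmt", "'if'", "cond"];
# quoted terminals are kept as single tokens, quotes included.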


def gnf(grammar):
    old_grammar = copy.deepcopy(grammar)
    new_grammar = defaultdict(list)
    isgnf = False
    while not isgnf:
        old_grammar = remove_left_recursion(old_grammar)
        for lhs, rules in old_grammar.items():
            for rule in rules:
                tokens = gettokens(rule)
                if len(tokens) == 1 and isTerminal(rule):
                    new_grammar[lhs].append(rule)
                    continue
                startoken = tokens[0]
                assert startoken != lhs
                endrule = tokens[1:]
                if not isTerminal(startoken):
                    newrules = []
                    extendrules = old_grammar[startoken]
                    for extension in extendrules:
                        temprule = endrule[:]
                        temprule.insert(0, extension)
                        newrules.append(temprule)
                    for newnew in newrules:
                        new_grammar[lhs].append(" ".join(newnew))
                else:
                    new_grammar[lhs].append(rule)
        isgnf = True
        for lhs, rules in new_grammar.items():
            for rule in rules:
                tokens = gettokens(rule)
                if len(tokens) == 1 and isTerminal(rule):
                    continue
                startoken = tokens[0]
                if not isTerminal(startoken):
                    isgnf = False
                    break
        if not isgnf:
            old_grammar = copy.deepcopy(new_grammar)
            new_grammar = defaultdict(list)
    return new_grammar
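

# Illustrative sketch (hypothetical rules): given {"A": ["B 'x'"], "B": ["'b'"]},
# one gnf() pass substitutes the leading non-terminal B with each of its rules,
# yielding {"A": ["'b' 'x'"], "B": ["'b'"]} -- every rule then starts with a
# terminal, which is the Greibach Normal Form property this loop checks for.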


def process_antlr4_grammar(data):
    productions = []
    production = []
    for line in data:
        if line != "\n":
            production.append(line)
        else:
            productions.append(production)
            production = []
    # Keep the final production even if the input does not end in a blank line
    if production:
        productions.append(production)
    final_rule_set = {}
    for production in productions:
        rules = []
        init = production[0]
        nonterminal = init.split(":")[0]
        rules.append(strip_chars(init.split(":")[1]).strip("| "))
        for production_rule in production[1:]:
            rules.append(strip_chars(production_rule.split("|")[0]))
        final_rule_set[nonterminal] = rules
    return final_rule_set
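

# Illustrative sketch of the input this parser expects (productions separated
# by blank lines; the first line holds "name: first-alternative", each
# following line holds one more alternative, and anything after a "|" on
# such a line is ignored):
#   expr: term |
#   expr '+' term
# parses to {"expr": ["term", "expr '+' term"]}.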


def remove_unit(grammar):
    nounitproductions = False
    old_grammar = copy.deepcopy(grammar)
    new_grammar = defaultdict(list)
    while not nounitproductions:
        for lhs, rules in old_grammar.items():
            for rhs in rules:
                # Checking if the rule is a unit production rule
                if len(gettokens(rhs)) == 1:
                    if not isTerminal(rhs):
                        new_grammar[lhs].extend(old_grammar[rhs])
                    else:
                        new_grammar[lhs].append(rhs)
                else:
                    new_grammar[lhs].append(rhs)
        # Checking there are no unit productions left in the grammar
        nounitproductions = True
        for lhs, rules in new_grammar.items():
            for rhs in rules:
                if len(gettokens(rhs)) == 1:
                    if not isTerminal(rhs):
                        nounitproductions = False
                        break
            if not nounitproductions:
                break
        # Unit productions are still there in the grammar -- repeat the process
        if not nounitproductions:
            old_grammar = copy.deepcopy(new_grammar)
            new_grammar = defaultdict(list)
    return new_grammar
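

# Illustrative sketch (hypothetical rules): remove_unit({"A": ["B"], "B": ["'b'"]})
# inlines the unit production A -> B and returns {"A": ["'b'"], "B": ["'b'"]}.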


def isTerminal(rule):
    pattern = re.compile(r"'(.*?)'")
    return pattern.match(rule) is not None
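

# Examples: isTerminal("'foo'") is True, isTerminal("stmt") is False.
# Note that only single-quoted tokens are recognized as terminals here.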


def remove_mixed(grammar):
    """
    Rewrite rules where terminals are mixed in with non-terminals,
    wrapping each such terminal in a fresh non-terminal
    """
    new_grammar = defaultdict(list)
    for lhs, rules in grammar.items():
        for rhs in rules:
            tokens = gettokens(rhs)
            if len(tokens) == 1:
                new_grammar[lhs].append(rhs)
                continue
            regen_rule = [tokens[0]]
            for token in tokens[1:]:
                # Identify if there is a terminal in the RHS
                if isTerminal(token):
                    # Check if a corresponding non-terminal already exists
                    nonterminal = terminal_exist(token, new_grammar)
                    if nonterminal:
                        regen_rule.append(nonterminal)
                    else:
                        new_nonterm = get_nonterminal()
                        new_grammar[new_nonterm].append(token)
                        regen_rule.append(new_nonterm)
                else:
                    regen_rule.append(token)
            new_grammar[lhs].append(" ".join(regen_rule))
    return new_grammar
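

# Illustrative sketch (hypothetical rules; generated names depend on COUNT):
# remove_mixed({"A": ["B 'x' C"]}) wraps the embedded terminal and returns
# roughly {"A": ["B GeneratedTermVar2 C"], "GeneratedTermVar2": ["'x'"]}.
# The leading token is left alone: a rule may start with a terminal in GNF.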


def strip_chars(rule):
    return rule.strip("\n\t ")


def get_nonterminal():
    global COUNT
    COUNT += 1
    return f"GeneratedTermVar{COUNT}"


def terminal_exist(token, grammar):
    # Only reuse a generated non-terminal whose single rule is exactly this
    # terminal, so unrelated rules are never aliased together
    for nonterminal, rules in grammar.items():
        if token in rules and len(rules) == 1:
            return nonterminal
    return None


def main(grammar_file, out, start):
    grammar = None
    # If the grammar file is a preprocessed NT file, then skip preprocessing
    if grammar_file.endswith(".json"):
        with open(grammar_file, "r") as fd:
            grammar = json.load(fd)
    elif grammar_file.endswith(".g4"):
        with open(grammar_file, "r") as fd:
            data = fd.readlines()
        grammar = process_antlr4_grammar(data)
    else:
        raise ValueError("Unknown file format passed. Accepts (.g4/.json)")
    grammar = convert_to_gnf(grammar, start)
    with open(out, "w+") as fd:
        json.dump(grammar, fd)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Script to convert grammar to GNF form"
    )
    parser.add_argument(
        "--gf", type=str, required=True, help="Location of grammar file"
    )
    parser.add_argument(
        "--out", type=str, required=True, help="Location of output file"
    )
    parser.add_argument("--start", type=str, required=True, help="Start token")
    parser.add_argument(
        "--debug", action="store_true", help="Write intermediate states to debug files"
    )
    args = parser.parse_args()
    DEBUG = args.debug
    main(args.gf, args.out, args.start)
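

# Example invocation (sketch; file names are placeholders):
#   python3 gnf_converter.py --gf grammar.g4 --out grammar_gnf.json --start program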