#!/usr/bin/env python3
# Originally from https://github.com/HexHive/Gramatron
# License: Apache-2
# Copyright 2021 HexHive
# Copyright 2021 AFLplusplus
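
# Illustrative usage (the grammar file name below is only an example; --gf and
# --limit are the real flags parsed at the bottom of this script):
#
#   ./gnf_converter.py --gf grammar.json --limit 5
#
# This writes <grammar>_automata.json (the representation passed on to the
# C module), <grammar>_transition.json and, when rules were culled by the
# stack limit, <grammar>_disallowed.json.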

import sys
import json
import re
from collections import defaultdict

# import pygraphviz as pgv

gram_data = None
state_count = 1
pda = []
worklist = []
state_stacks = {}

# === If the user provides an upper bound on the stack size during FSA creation ===
# Specifies the upper bound to which the stack is allowed to grow.
# If, for any generated state, the stack size is >= stack_limit, then that
# state is not expanded further.
stack_limit = None
# Holds the set of rules left unexpanded owing to the user-passed stack size limit
unexpanded_rules = set()
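
# Rough sketch of the data shapes used below (illustrative; the symbol names
# are made up, "Start" is the entry main() reads first):
#   gram_data:    {"Start": ["PROGRAM"], "PROGRAM": ["'a' A B", ...], ...}
#   worklist:     entries of the form [state_name, [nonterminals left to expand]]
#   pda:          list of transition dicts shaped like get_template()
#   state_stacks: {state_name: list of nonterminals on that state's stack}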


def main(grammar, limit):
    global worklist, gram_data, stack_limit
    current = "0"
    stack_limit = limit
    if stack_limit:
        print("[X] Operating in bounded stack mode")

    with open(grammar, "r") as fd:
        gram_data = json.load(fd)
    start_symbol = gram_data["Start"][0]
    worklist.append([current, [start_symbol]])
    # print (grammar)
    filename = (grammar.split("/")[-1]).split(".")[0]

    while worklist:
        # Take an element from the worklist
        # print ('================')
        # print ('Worklist:', worklist)
        element = worklist.pop(0)
        prep_transitions(element)

    pda_file = filename + "_transition.json"
    graph_file = filename + ".png"
    # print ('XXXXXXXXXXXXXXXX')
    # print ('PDA file:%s Png graph file:%s' % (pda_file, graph_file))
    # XXX Commented out because visualization of current version of PHP causes segfault
    # Create the graph and dump the transitions to a file
    # create_graph(filename)
    transformed = postprocess()
    with open(filename + "_automata.json", "w+") as fd:
        json.dump(transformed, fd)
    with open(filename + "_transition.json", "w+") as fd:
        json.dump(pda, fd)
    if not unexpanded_rules:
        print("[X] No unexpanded rules, absolute FSA formed")
        exit(0)
    else:
        print(
            "[X] Certain rules were not expanded due to stack size limit. Inexact approximation has been created and the disallowed rules have been put in {}_disallowed.json".format(
                filename
            )
        )
        print("[X] Number of unexpanded rules:", len(unexpanded_rules))
        with open(filename + "_disallowed.json", "w+") as fd:
            json.dump(list(unexpanded_rules), fd)
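

# Note: create_graph() below is an optional visualization helper; it assumes
# pygraphviz is available and imported as pgv (see the commented-out import at
# the top of this file). The call to it in main() is commented out as well.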


def create_graph(filename):
    """
    Creates a DOT representation of the PDA
    """
    global pda
    G = pgv.AGraph(strict=False, directed=True)
    for transition in pda:
        print("Transition:", transition)
        G.add_edge(
            transition["source"],
            transition["dest"],
            label="Term:{}".format(transition["terminal"]),
        )
    G.layout(prog="dot")
    print("Do it up 2")
    G.draw(filename + ".png")


def prep_transitions(element):
    """
    Generates transitions
    """
    global gram_data, state_count, pda, worklist, state_stacks, stack_limit, unexpanded_rules
    state = element[0]
    try:
        nonterminal = element[1][0]
    except IndexError:
        # Final state was encountered, pop from worklist without doing anything
        return
    rules = gram_data[nonterminal]
    count = 1
    for rule in rules:
        isRecursive = False
        # print ('Current state:', state)
        terminal, ss, termIsRegex = tokenize(rule)
        transition = get_template()
        transition["trigger"] = "_".join([state, str(count)])
        transition["source"] = state
        transition["dest"] = str(state_count)
        transition["ss"] = ss
        transition["terminal"] = terminal
        transition["rule"] = "{} -> {}".format(nonterminal, rule)
        if termIsRegex:
            transition["termIsRegex"] = True

        # Creating a state stack for the new state
        try:
            state_stack = state_stacks[state][:]
        except:
            state_stack = []
        if len(state_stack):
            state_stack.pop(0)
        if ss:
            for symbol in ss[::-1]:
                state_stack.insert(0, symbol)
        transition["stack"] = state_stack

        # Check if a recursive transition state is being created; if so, make a backward
        # edge and don't add anything to the worklist
        # print (state_stacks)
        if state_stacks:
            for state_element, stack in state_stacks.items():
                # print ('Stack:', sorted(stack))
                # print ('State stack:', sorted(state_stack))
                if sorted(stack) == sorted(state_stack):
                    transition["dest"] = state_element
                    # print ('Recursive:', transition)
                    pda.append(transition)
                    count += 1
                    isRecursive = True
                    break
        # If a recursive transition was exercised, don't add the same transition as a new
        # edge; continue on to the next transitions
        if isRecursive:
            continue

        # If the generated state has a stack size > stack_limit then that state is abandoned
        # and not added to the FSA or the worklist for further expansion
        if stack_limit:
            if len(transition["stack"]) > stack_limit:
                unexpanded_rules.add(transition["rule"])
                continue

        # Create transitions for the non-recursive relations and add to the worklist
        # print ('Normal:', transition)
        # print ('State2:', state)
        pda.append(transition)
        worklist.append([transition["dest"], transition["stack"]])
        state_stacks[transition["dest"]] = state_stack
        state_count += 1
        count += 1
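

# Worked example of one expansion step (illustrative; "PROGRAM", "A", "B" and
# the rule are hypothetical). Expanding the worklist entry ["0", ["PROGRAM"]]
# with the rule PROGRAM -> 'a' A B yields a transition 0 --'a'--> 1 whose stack
# is ["A", "B"] (the expanded nonterminal is removed from the stack, the rule's
# stack symbols are pushed on front), and ["1", ["A", "B"]] is appended to the
# worklist. If the new stack sorts equal to the stack of an existing state, the
# transition is redirected back to that state instead, so recursive rules
# become backward edges rather than new states.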


def tokenize(rule):
    """
    Gets the terminal and the corresponding stack symbols from a rule in GNF form
    """
    pattern = re.compile(r"([r])*'([\s\S]+)'([\s\S]*)")
    terminal = None
    ss = None
    termIsRegex = False
    match = pattern.match(rule)
    if match.group(1):
        termIsRegex = True
    if match.group(2):
        terminal = match.group(2)
    else:
        raise AssertionError("Rule is not in GNF form")

    if match.group(3):
        ss = (match.group(3)).split()

    return terminal, ss, termIsRegex
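

# Illustrative tokenize() behavior (these rules are made-up examples):
#   tokenize("'a' A B")     -> ("a", ["A", "B"], False)
#   tokenize("r'[0-9]+' A") -> ("[0-9]+", ["A"], True)
#   tokenize("'z'")         -> ("z", None, False)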


def get_template():
    transition_template = {
        "trigger": None,
        "source": None,
        "dest": None,
        "termIsRegex": False,
        "terminal": None,
        "stack": [],
    }
    return transition_template
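

# Note: postprocess1() below looks like an earlier, list-based variant of
# postprocess(); only postprocess() is called from main(). Its
# "[[]] * len(states)" pre-allocation makes every slot alias the same list, so
# the per-state appends inside it would show up in every slot.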


def postprocess1():
    """
    Creates a representation to be passed on to the C-module
    """
    global pda
    final_struct = {}
    # Supporting data structures for when a stack limit is imposed
    culled_pda = []
    culled_final = []
    num_transitions = 0  # Keep track of the number of transitions

    states, final, initial = _get_states()
    memoized = [[]] * len(states)

    print(initial)
    assert len(initial) == 1, "More than one init state found"

    # Cull transitions to states which were not expanded owing to the stack limit
    if stack_limit:
        blocklist = []
        for final_state in final:
            for transition in pda:
                if (transition["dest"] == final_state) and (
                    len(transition["stack"]) > 0
                ):
                    blocklist.append(transition["dest"])
                    continue
                else:
                    culled_pda.append(transition)

        culled_final = [state for state in final if state not in blocklist]

        assert len(culled_final) == 1, "More than one final state found"

        for transition in culled_pda:
            state = transition["source"]
            if transition["dest"] in blocklist:
                continue
            num_transitions += 1
            memoized[int(state)].append(
                (transition["trigger"], int(transition["dest"]), transition["terminal"])
            )
        final_struct["init_state"] = int(initial)
        final_struct["final_state"] = int(culled_final[0])
        # When states are culled, the transition indexing is still relative to
        # the original state numbering, hence we report the original number of
        # states as well
        print("[X] Actual Number of states:", len(memoized))
        print("[X] Number of transitions:", num_transitions)
        print("[X] Original Number of states:", len(states))
        final_struct["pda"] = memoized
        return final_struct

    # Running FSA construction in exact approximation mode and postprocessing it like so
    for transition in pda:
        state = transition["source"]
        memoized[int(state)].append(
            (transition["trigger"], int(transition["dest"]), transition["terminal"])
        )

    final_struct["init_state"] = int(initial)
    final_struct["final_state"] = int(final[0])
    print("[X] Actual Number of states:", len(memoized))
    final_struct["pda"] = memoized
    return final_struct


def postprocess():
    """
    Creates a representation to be passed on to the C-module
    """
    global pda
    final_struct = {}
    memoized = defaultdict(list)
    # Supporting data structures for when a stack limit is imposed
    culled_pda = []
    culled_final = []
    num_transitions = 0  # Keep track of the number of transitions

    states, final, initial = _get_states()

    print(initial)
    assert len(initial) == 1, "More than one init state found"

    # Cull transitions to states which were not expanded owing to the stack limit
    if stack_limit:
        blocklist = []
        for final_state in final:
            for transition in pda:
                if (transition["dest"] == final_state) and (
                    len(transition["stack"]) > 0
                ):
                    blocklist.append(transition["dest"])
                    continue
                else:
                    culled_pda.append(transition)

        culled_final = [state for state in final if state not in blocklist]

        assert len(culled_final) == 1, "More than one final state found"

        for transition in culled_pda:
            state = transition["source"]
            if transition["dest"] in blocklist:
                continue
            num_transitions += 1
            memoized[int(state)].append(
                [transition["trigger"], int(transition["dest"]), transition["terminal"]]
            )

        final_struct["init_state"] = int(initial)
        final_struct["final_state"] = int(culled_final[0])
        # When states are culled, the transition indexing is still relative to
        # the original state numbering, hence we keep the original number of
        # states for the output list
        print("[X] Actual Number of states:", len(memoized.keys()))
        print("[X] Number of transitions:", num_transitions)
        print("[X] Original Number of states:", len(states))
        # final_struct["numstates"] = len(states)
        memoized_list = [[]] * len(states)
    else:
        # Running FSA construction in exact approximation mode and postprocessing it like so
        for transition in pda:
            state = transition["source"]
            memoized[int(state)].append(
                [transition["trigger"], int(transition["dest"]), transition["terminal"]]
            )

        final_struct["init_state"] = int(initial)
        final_struct["final_state"] = int(final[0])
        print("[X] Actual Number of states:", len(memoized.keys()))
        # final_struct["numstates"] = len(memoized.keys())
        memoized_list = [[]] * len(memoized.keys())

    for k in memoized.keys():
        memoized_list[k] = memoized[k]
    final_struct["pda"] = memoized_list
    return final_struct
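

# Shape of the structure returned by postprocess() and dumped into
# <grammar>_automata.json (values are illustrative):
#   {
#       "init_state": 0,
#       "final_state": 7,
#       "pda": [[["0_1", 1, "a"], ...],  # transitions leaving state 0
#               ...],                    # indexed by source state
#   }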


def _get_states():
    source = set()
    dest = set()
    global pda
    for transition in pda:
        source.add(transition["source"])
        dest.add(transition["dest"])
    source_copy = source.copy()
    source_copy.update(dest)
    # Returns all states, the final states (never the source of a transition),
    # and the initial state (never the destination of a transition).
    return (
        list(source_copy),
        list(dest.difference(source)),
        str("".join(list(source.difference(dest)))),
    )


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Script to convert GNF grammar to PDA")
    parser.add_argument("--gf", type=str, help="Location of GNF grammar")
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Specify the upper bound for the stack size",
    )
    args = parser.parse_args()
    main(args.gf, args.limit)