#!/usr/bin/env python3
# Originally from https://github.com/HexHive/Gramatron
# License: Apache-2
# Copyright 2021 HexHive
# Copyright 2021 AFLplusplus

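"""Convert a context-free grammar to Greibach Normal Form (GNF).

Accepts an ANTLR4-style grammar (.g4) or a preprocessed JSON grammar,
removes unit productions, splits rules that mix terminals with
non-terminals, eliminates direct left recursion, and expands leading
non-terminals until every production starts with a terminal. The result
is written out as JSON for Gramatron-style grammar-based fuzzing.
"""
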
import re
import copy
import json
from collections import defaultdict

DEBUG = False
COUNT = 1  # suffix counter for generated non-terminal names


def convert_to_gnf(grammar, start):
    if DEBUG:
        with open('debug_preprocess.json', 'w+') as fd:
            json.dump(grammar, fd)
    grammar = remove_unit(grammar)  # eliminate unit productions
    if DEBUG:
        with open('debug_unit.json', 'w+') as fd:
            json.dump(grammar, fd)
    grammar = remove_mixed(grammar)  # eliminate terminals mixed in with non-terminals
    if DEBUG:
        with open('debug_mixed.json', 'w+') as fd:
            json.dump(grammar, fd)
    grammar = gnf(grammar)

    # Dump GNF form of the grammar with only reachable rules
    # reachable_grammar = get_reachable(grammar, start)
    # with open('debug_gnf_reachable.json', 'w+') as fd:
    #     json.dump(reachable_grammar, fd)
    if DEBUG:
        with open('debug_gnf.json', 'w+') as fd:
            json.dump(grammar, fd)

    grammar["Start"] = [start]
    return grammar

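# Illustrative end-to-end example (not from the original source): with
# start="A", the grammar {"A": ["B 'x'"], "B": ["'b'"]} passes remove_unit()
# unchanged, remove_mixed() lifts the trailing 'x' into a generated rule,
# and gnf() expands the leading B, so every rule starts with a terminal:
#   {"A": ["'b' GeneratedTermVar2"], "B": ["'b'"],
#    "GeneratedTermVar2": ["'x'"], "Start": ["A"]}
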
def remove_left_recursion(grammar):
    # Remove the left recursion in the grammar rules.
    # This algorithm is adopted from
    # https://www.geeksforgeeks.org/introduction-of-parsing-ambiguity-and-parsers-set-1/
    # Note that the current implementation does not guarantee completeness
    # and will not remove indirect recursions like
    # { "A": ["B C"], "B": ["A D"] }. Therefore, we need to call this
    # function each time the rules are updated.
    old_grammar = copy.deepcopy(grammar)
    new_grammar = defaultdict(list)
    no_left_recursion = False
    while not no_left_recursion:
        for lhs, rules in old_grammar.items():
            left_recursion = []
            others = []
            for rule in rules:
                tokens = gettokens(rule)
                if tokens[0] == lhs:
                    left_recursion.append(tokens)
                else:
                    others.append(tokens)
            if left_recursion:
                # Rewrite A -> A a | b  as  A -> b R  and  R -> a R | ' '
                new_rule = get_nonterminal()
                for r in others:
                    r.append(new_rule)
                left_recursion = [r[1:] + [new_rule] for r in left_recursion]
                left_recursion.append(["' '"])
                new_grammar[lhs] = [' '.join(rule) for rule in others]
                new_grammar[new_rule] = [' '.join(rule) for rule in left_recursion]
            else:
                new_grammar[lhs] = [' '.join(rule) for rule in others]
        # Check whether the rewritten grammar is still left-recursive
        no_left_recursion = True
        for lhs, rules in new_grammar.items():
            for rule in rules:
                tokens = gettokens(rule)
                if tokens[0] == lhs:
                    no_left_recursion = False
                    break
            else:
                continue
            break
        if not no_left_recursion:
            old_grammar = copy.deepcopy(new_grammar)
            new_grammar = defaultdict(list)
    return new_grammar

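# Illustrative example (not from the original source): for
# {"A": ["A 'a'", "'b'"]}, one pass produces (generated names depend on
# the global COUNT):
#   A                 -> 'b' GeneratedTermVar2
#   GeneratedTermVar2 -> 'a' GeneratedTermVar2 | ' '
# where the single-space terminal ' ' stands in for an empty expansion.
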
def get_reachable(grammar, start):
    '''
    Returns a grammar without dead (unreachable) rules
    '''
    processed = set()
    reachable_grammar = dict()
    worklist = [start]

    while worklist:
        nt = worklist.pop(0)
        processed.add(nt)
        reachable_grammar[nt] = grammar[nt]
        for rule in grammar[nt]:
            for token in gettokens(rule):
                # Queue every not-yet-visited non-terminal on the RHS
                if not isTerminal(token) and token not in processed:
                    worklist.append(token)
    return reachable_grammar

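# e.g. get_reachable({"A": ["'a'"], "B": ["'b'"]}, "A") == {"A": ["'a'"]}
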
def gettokens(rule):
    pattern = re.compile(r"([^\s\"\']+)|\"([^\"]*)\"|\'([^\']*)\'")
    return [matched.group(0) for matched in pattern.finditer(rule)]

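# Example: gettokens("stmt 'if' expr") returns ["stmt", "'if'", "expr"];
# quoted terminals keep their quotes so isTerminal() can recognize them.
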
def gnf(grammar):
    old_grammar = copy.deepcopy(grammar)
    new_grammar = defaultdict(list)
    isgnf = False
    while not isgnf:
        old_grammar = remove_left_recursion(old_grammar)
        for lhs, rules in old_grammar.items():
            for rule in rules:
                tokens = gettokens(rule)
                if len(tokens) == 1 and isTerminal(rule):
                    new_grammar[lhs].append(rule)
                    continue
                startoken = tokens[0]
                assert startoken != lhs
                endrule = tokens[1:]
                if not isTerminal(startoken):
                    # Substitute the leading non-terminal with each of its
                    # expansions
                    newrules = []
                    extendrules = old_grammar[startoken]
                    for extension in extendrules:
                        temprule = endrule[:]
                        temprule.insert(0, extension)
                        newrules.append(temprule)
                    for newnew in newrules:
                        new_grammar[lhs].append(' '.join(newnew))
                else:
                    new_grammar[lhs].append(rule)
        # Check whether every rule now starts with a terminal
        isgnf = True
        for lhs, rules in new_grammar.items():
            for rule in rules:
                tokens = gettokens(rule)
                if len(tokens) == 1 and isTerminal(rule):
                    continue
                startoken = tokens[0]
                if not isTerminal(startoken):
                    isgnf = False
                    break
            else:
                continue
            break
        if not isgnf:
            old_grammar = copy.deepcopy(new_grammar)
            new_grammar = defaultdict(list)
    return new_grammar

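# Illustrative substitution step (not from the original source): with
# {"A": ["B 'x'"], "B": ["'b'"]}, the leading B in A's rule is replaced by
# B's expansion, giving {"A": ["'b' 'x'"], "B": ["'b'"]}; every rule then
# starts with a terminal, so the loop terminates.
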
def process_antlr4_grammar(data):
    productions = []
    production = []
    # Productions are expected to be separated by blank lines; the first
    # line of a production holds "Name: rule" and the following lines hold
    # the remaining alternatives
    for line in data:
        if line != '\n':
            production.append(line)
        else:
            productions.append(production)
            production = []
    if production:
        # Keep the last production even without a trailing blank line
        productions.append(production)
    final_rule_set = {}
    for production in productions:
        rules = []
        init = production[0]
        nonterminal = init.split(':')[0]
        rules.append(strip_chars(init.split(':')[1]).strip('| '))
        for production_rule in production[1:]:
            rules.append(strip_chars(production_rule.split('|')[0]))
        final_rule_set[nonterminal] = rules
    return final_rule_set

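# Expected input sketch (an assumption inferred from the parsing above,
# with '|' closing each alternative rather than opening the next line):
#
#   expr: term '+' expr |
#   term
#
#   term: 'x'
#
# which yields {"expr": ["term '+' expr", "term"], "term": ["'x'"]}.
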
def remove_unit(grammar):
    nounitproductions = False
    old_grammar = copy.deepcopy(grammar)
    new_grammar = defaultdict(list)
    while not nounitproductions:
        for lhs, rules in old_grammar.items():
            for rhs in rules:
                # A unit production is a single non-terminal on the RHS;
                # replace it with the rules of that non-terminal
                if len(gettokens(rhs)) == 1 and not isTerminal(rhs):
                    new_grammar[lhs].extend(old_grammar[rhs])
                else:
                    new_grammar[lhs].append(rhs)
        # Check that no unit productions are left in the grammar
        nounitproductions = True
        for lhs, rules in new_grammar.items():
            for rhs in rules:
                if len(gettokens(rhs)) == 1 and not isTerminal(rhs):
                    nounitproductions = False
                    break
            if not nounitproductions:
                break
        # Unit productions are still in the grammar -- repeat the process
        if not nounitproductions:
            old_grammar = copy.deepcopy(new_grammar)
            new_grammar = defaultdict(list)
    return new_grammar

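# Illustrative example: {"A": ["B"], "B": ["'b' C", "'c'"]} becomes
# {"A": ["'b' C", "'c'"], "B": ["'b' C", "'c'"]} -- the unit rule A -> B
# is replaced by B's right-hand sides.
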
def isTerminal(rule):
    # A token is a terminal iff it is wrapped in single quotes
    pattern = re.compile(r"\'(.*?)\'")
    return bool(pattern.match(rule))

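# e.g. isTerminal("'foo'") is True, isTerminal("stmt") is False
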
def remove_mixed(grammar):
    '''
    Remove rules where there are terminals mixed in with non-terminals
    '''
    new_grammar = defaultdict(list)
    for lhs, rules in grammar.items():
        for rhs in rules:
            tokens = gettokens(rhs)
            if len(tokens) == 1:
                new_grammar[lhs].append(rhs)
                continue
            # The leading token may stay a terminal (GNF allows it); every
            # later terminal is lifted into its own non-terminal
            regen_rule = [tokens[0]]
            for token in tokens[1:]:
                # Identify if there is a terminal in the RHS
                if isTerminal(token):
                    # Check if a corresponding non-terminal already exists
                    nonterminal = terminal_exist(token, new_grammar)
                    if nonterminal:
                        regen_rule.append(nonterminal)
                    else:
                        new_nonterm = get_nonterminal()
                        new_grammar[new_nonterm].append(token)
                        regen_rule.append(new_nonterm)
                else:
                    regen_rule.append(token)
            new_grammar[lhs].append(' '.join(regen_rule))
    return new_grammar

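# Illustrative example: {"A": ["'a' B 'c'"]} becomes
# {"A": ["'a' B GeneratedTermVar2"], "GeneratedTermVar2": ["'c'"]} -- the
# leading 'a' stays in place, the trailing 'c' is lifted out (generated
# names depend on the global COUNT).
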
def strip_chars(rule):
    return rule.strip('\n\t ')

def get_nonterminal():
    # Mint a fresh non-terminal name
    global COUNT
    COUNT += 1
    return f"GeneratedTermVar{COUNT}"

def terminal_exist(token, grammar):
    # Reuse a generated non-terminal only if it expands to exactly this
    # one terminal
    for nonterminal, rules in grammar.items():
        if token in rules and len(rules) == 1:
            return nonterminal
    return None

def main(grammar_file, out, start):
    grammar = None
    # If the grammar file is a preprocessed JSON file, skip preprocessing
    if grammar_file.endswith('.json'):
        with open(grammar_file, 'r') as fd:
            grammar = json.load(fd)
    elif grammar_file.endswith('.g4'):
        with open(grammar_file, 'r') as fd:
            data = fd.readlines()
            grammar = process_antlr4_grammar(data)
    else:
        raise ValueError('Unknown file format passed. Accepts (.g4/.json)')

    grammar = convert_to_gnf(grammar, start)
    with open(out, 'w+') as fd:
        json.dump(grammar, fd)

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Script to convert grammar to GNF form')
    parser.add_argument(
        '--gf',
        type=str,
        required=True,
        help='Location of grammar file')
    parser.add_argument(
        '--out',
        type=str,
        required=True,
        help='Location of output file')
    parser.add_argument(
        '--start',
        type=str,
        required=True,
        help='Start token')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Write intermediate states to debug files')
    args = parser.parse_args()
    DEBUG = args.debug

    main(args.gf, args.out, args.start)
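
# Example invocation (illustrative; grammar file and start symbol depend
# on your target):
#   ./gnf_converter.py --gf PHP.g4 --out php_gnf.json --start program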