1
Fork 0
mirror of https://git.cs.ou.nl/joshua.moerman/utf8-learner.git synced 2025-07-01 14:17:45 +02:00
utf8-learner/dfa-decompose/main.py
2025-06-13 13:21:37 +02:00

195 lines
5.7 KiB
Python

# Copyright 2024-2025 Joshua Moerman, Open Universiteit. All rights reserved
# SPDX-License-Identifier: EUPL-1.2
import itertools
from pysat.solvers import Solver
from pysat.card import CardEnc
from pysat.formula import IDPool
# Script to decompose a DFA as the intersection of smaller DFAs.
# As an example this is applied to the UTF-8 automaton, see
# https://joshuamoerman.nl/2025/6/The-UTF-8-Automaton.html
# Regular language of UTF8 sequences (9 states including a sink state).
# The model used here is a complete DFA, bytes are mapped to classes
# as explained here: https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
# Possible improvements:
# - Allow partial models. The current smallest decomposition is 6+4.
# Coincidentally, both components have a sink state, so we get a
# decomposition of size 5+3, which is quite good. Encoding partiality
# directly will always find a minimal decomposition.
# - Get rid of input "8", since it is never accepted in a UTF8 string.
# - Only use reduced_sizes and not size. This will reduce the number of
# variables and constraints.
# - Write a loop to search for smallest decompositions, this is now
# done by hand.
# With 2 components, decomposable as:
# 5+5 NO
# 6+4 YES
# 7+3 YES
# 8+2 NO
# and everything below: NO
# With 3 components, decomposable as:
# 4+4+4 NO
# 5+4+3 YES
# 5+5+2 ???
# 6+3+3 YES
# 6+4+2 YES (see above)
# 7+3+2 YES (see above)
# 8+2+2 ???
# any x+y+z such that the sum is 11 or less: NO
# any x+y+z such that the sum is 13 or more: YES
def exampleDfa():
    """Build the complete example DFA over 9 states and 12 input classes.

    State 0 is both the initial state and the only accepting state.  Every
    (state, input) pair without an explicit entry below is sent to state 1,
    which therefore loops to itself on all inputs.

    Returns a dict with the keys 'initial' (int), 'final' (set of ints) and
    'trans' (dict mapping (state, input) to the target state).
    """
    # Explicitly specified transitions, grouped per source state:
    # source -> [(input, target), ...]
    explicit = {
        0: [(0, 0), (2, 2), (3, 3), (4, 5), (5, 8), (6, 7), (10, 4), (11, 6)],
        2: [(1, 0), (7, 0), (9, 0)],
        3: [(1, 2), (7, 2), (9, 2)],
        4: [(7, 2)],
        5: [(1, 2), (9, 2)],
        6: [(7, 3), (9, 3)],
        7: [(1, 3), (7, 3), (9, 3)],
        8: [(1, 3)],
    }
    trans = {
        (source, symbol): target
        for source, edges in explicit.items()
        for symbol, target in edges
    }
    # Guards against a mistyped or duplicated entry in the table above.
    assert len(trans) == 23

    # Complete the automaton: anything still undefined goes to state 1.
    for source in range(9):
        for symbol in range(12):
            trans.setdefault((source, symbol), 1)

    return {'initial': 0, 'final': {0}, 'trans': trans}
class DfaEncode:
    """SAT encoding that searches for component DFAs whose intersection
    accepts the same language as the DFA returned by exampleDfa().

    Component ``m`` has ``reduced_sizes[m]`` states, numbered from 0, with
    state 0 as its initial state.  The encoding uses four families of
    propositional variables, all allocated through ``self.vpool``:

    * ``trans(m, s, a, t)`` - component m moves from s to t on input a,
    * ``final(m, s)``       - state s of component m is accepting,
    * ``bisim(ss, org)``    - the tuple ss of component states is related
      to state ``org`` of the original DFA,
    * ``bool(b)``           - the constants True and False.
    """

    def __init__(self, reduced_sizes=None):
        """Set up an empty solver for the given component sizes.

        reduced_sizes: mapping from component index to its number of
            states.  The keys must be exactly 0 .. k-1 and every size must
            be at least 1.  Defaults to ``{0: 7, 1: 2, 2: 2}``.

        Raises ValueError when the mapping is empty, has other keys, or
        contains a size below 1.
        """
        if reduced_sizes is None:
            reduced_sizes = {0: 7, 1: 2, 2: 2}
        reduced_sizes = dict(reduced_sizes)
        if not reduced_sizes:
            raise ValueError('reduced_sizes must not be empty')
        if sorted(reduced_sizes) != list(range(len(reduced_sizes))):
            raise ValueError('reduced_sizes must have the keys 0 .. k-1')
        if min(reduced_sizes.values()) < 1:
            # every component needs at least its initial state 0
            raise ValueError('every component needs at least one state')

        self.components = len(reduced_sizes)
        self.reduced_sizes = reduced_sizes
        print(f'SIZES = {self.reduced_sizes}')
        # Upper bound on state numbers, used by the sanity assertions below.
        self.size = max(self.reduced_sizes.values())

        self.alphabet = list(range(12))
        self.dfa = exampleDfa()
        self.dfa_size = 9

        self.vpool = IDPool()
        self.solver = Solver()

    def var_bool(self, b):
        """Return the variable standing for the constant ``b``."""
        return self.vpool.id(('bool', b))

    def var_trans(self, m, s, a, t):
        """Return the variable "component m goes from s to t on input a"."""
        assert 0 <= m < self.components
        assert 0 <= s < self.size
        assert a in self.alphabet
        assert 0 <= t < self.size
        return self.vpool.id(('trans', m, s, a, t))

    def var_final(self, m, s):
        """Return the variable "state s of component m is accepting"."""
        assert 0 <= m < self.components
        assert 0 <= s < self.size
        return self.vpool.id(('final', m, s))

    def var_bisim(self, ss, org):
        """Return the variable relating the state tuple ``ss`` (one state
        per component) to state ``org`` of the original DFA."""
        assert len(ss) == self.components
        return self.vpool.id(('bisim', tuple(ss), org))

    def _state_tuples(self):
        """List every tuple holding one state of each component."""
        ranges = [range(self.reduced_sizes[m]) for m in range(self.components)]
        return list(itertools.product(*ranges))

    def constrain_component(self, m):
        """Make component m a complete DFA: each state has exactly one
        successor for each input symbol."""
        states = range(self.reduced_sizes[m])
        for s in states:
            for a in self.alphabet:
                lits = [self.var_trans(m, s, a, t) for t in states]
                cnf = CardEnc.equals(lits, 1, vpool=self.vpool)
                self.solver.append_formula(cnf.clauses)

    def constrain_bisim(self):
        """Add the clauses tying the product of the components to the
        original DFA.

        Only states below ``reduced_sizes[m]`` are enumerated for component
        m: constrain_component() creates no other states, so tuples that
        mention a larger state number cannot occur in a run.
        """
        tuples = self._state_tuples()
        members = range(self.components)
        final_states = self.dfa['final']
        transitions = self.dfa['trans']

        # we require initial states to be bisimilar
        ss_init = [0 for _ in members]
        self.solver.add_clause([self.var_bisim(ss_init, self.dfa['initial'])])

        for org in range(self.dfa_size):
            org_accepts = self.var_bool(org in final_states)
            for ss in tuples:
                not_related = -self.var_bisim(ss, org)
                finals = [self.var_final(m, ss[m]) for m in members]

                # require the intersection of the components to have the
                # right acceptance:
                # if bisim and all components accept => original dfa accepts
                self.solver.add_clause(
                    [not_related] + [-f for f in finals] + [org_accepts])
                # if bisim and original dfa accepts => each component accepts
                for f in finals:
                    self.solver.add_clause([not_related, -org_accepts, f])

                # require transitions to lead to bisimilar states
                for a in self.alphabet:
                    org2 = transitions[(org, a)]
                    for tt in tuples:
                        clause = [not_related]
                        clause += [-self.var_trans(m, ss[m], a, tt[m])
                                   for m in members]
                        clause.append(self.var_bisim(tt, org2))
                        self.solver.add_clause(clause)

    def constraint(self):
        """Add the complete encoding to the solver."""
        # Pin the two constants used by constrain_bisim().
        self.solver.add_clause([-self.var_bool(False)])
        self.solver.add_clause([self.var_bool(True)])
        for m in range(self.components):
            self.constrain_component(m)
        self.constrain_bisim()

    def solve(self):
        """Run the solver and return its result (truthy when a
        decomposition with the configured sizes exists)."""
        return self.solver.solve()

    def get_model(self):
        """Decode the solver's model into one DFA per component.

        Returns a dict mapping each component index to a dict with the keys
        'trans', 'final' and 'initial', or None when the solver has no
        model (for instance because the instance is unsatisfiable).
        """
        lits = self.solver.get_model()
        if lits is None:
            return None
        model = {lit for lit in lits if lit > 0}

        components = {}
        for m in range(self.components):
            # Only look at the states the component really has.  Variables
            # for larger state numbers are not constrained by the encoding,
            # so their values in the model carry no meaning.
            states = range(self.reduced_sizes[m])
            dfa = {'trans': {}, 'final': set(), 'initial': 0}
            for s in states:
                if self.var_final(m, s) in model:
                    dfa['final'].add(s)
                for a in self.alphabet:
                    for t in states:
                        if self.var_trans(m, s, a, t) in model:
                            dfa['trans'][(s, a)] = t
            components[m] = dfa
        return components
def main():
    """Build the encoding, run the SAT solver and print the outcome.

    Prints the decoded component DFAs when the instance is satisfiable and
    a short notice otherwise.
    """
    encoder = DfaEncode()
    encoder.constraint()
    # The solver only holds a model after a successful solve(); decoding
    # without this check fails when no decomposition exists.
    if not encoder.solver.solve():
        print('UNSAT: no decomposition with these sizes')
        return
    m = encoder.get_model()
    print(m)


if __name__ == '__main__':
    main()