# Copyright 2024-2025 Joshua Moerman, Open Universiteit. All rights reserved # SPDX-License-Identifier: EUPL-1.2 import itertools from pysat.solvers import Solver from pysat.card import CardEnc from pysat.formula import IDPool # Script to decompose a DFA as the intersection of smaller DFAs. # As an example this is applied to the UTF-8 automaton, see # https://joshuamoerman.nl/2025/6/The-UTF-8-Automaton.html # Regular language of UTF8 sequences (9 states including a sink state). # The model used here is a complete DFA, bytes are mapped to classes # as explained here: https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ # Possible improvements: # - Allow partial models. The current smallest decomposition is as 6+4. # Coincidentally, both components have a sink state and we get a # decomposition of size 5+3 which is quite good. Encoding partiallity # directly will always find a minimal decomposition. # - Get rid of input "8", since it is never accepted in a UTF8 string. # - Only use reduced_sizes and not size. This will reduce the number of # variables and constraints. # - Write a loop to search for smallest decompositions, this is now # done by hand. # With 2 components, decomposable as: # 5+5 NO # 6+4 YES # 7+3 YES # 8+2 NO # and everything below: NO # With 3 components, decomposable as: # 4+4+4 NO # 5+4+3 YES # 5+5+2 ??? # 6+3+3 YES # 6+4+2 YES (see above) # 7+3+2 YES (see above) # 8+2+2 ??? # any x+y+z such that the sum is 11 or less: NO # any x+y+z such that the sum is 13 or more: YES def exampleDfa(): initial = 0 final = set([0]) alphabet = list(range(12)) states = list(range(9)) trans = {} trans[(0, 0)] = 0 trans[(0, 2)] = 2 trans[(0, 3)] = 3 trans[(0, 4)] = 5 trans[(0, 5)] = 8 trans[(0, 6)] = 7 trans[(0, 10)] = 4 trans[(0, 11)] = 6 trans[(2, 1)] = 0 trans[(2, 7)] = 0 trans[(2, 9)] = 0 trans[(3, 1)] = 2 trans[(3, 7)] = 2 trans[(3, 9)] = 2 trans[(4, 7)] = 2 trans[(5, 1)] = 2 trans[(5, 9)] = 2 trans[(6, 7)] = 3 trans[(6, 9)] = 3 trans[(7, 1)] = 3 trans[(7, 7)] = 3 trans[(7, 9)] = 3 trans[(8, 1)] = 3 assert len(trans) == 23 for s in states: for a in alphabet: if (s,a) not in trans: trans[(s,a)] = 1 return {'initial': initial, 'final': final, 'trans': trans} class DfaEncode: def __init__(self): self.components = 3 self.reduced_sizes = {0: 7, 1: 2, 2: 2} print(f'SIZES = {self.reduced_sizes}') self.size = max(self.reduced_sizes.values()) self.alphabet = list(range(12)) self.dfa = exampleDfa() self.dfa_size = 9 self.vpool = IDPool() self.solver = Solver() def var_bool(self, b): return self.vpool.id(('bool', b)) def var_trans(self, m, s, a, t): assert 0 <= m and m < self.components assert 0 <= s and s < self.size assert a in self.alphabet assert 0 <= t and t < self.size return self.vpool.id(('trans', m, s, a, t)) def var_final(self, m, s): assert 0 <= m and m < self.components assert 0 <= s and s < self.size return self.vpool.id(('final', m, s)) def var_bisim(self, ss, org): assert len(ss) == self.components return self.vpool.id(('bisim', tuple(ss), org)) def constrain_component(self, m): for s in range(self.size): for a in self.alphabet: lits = [self.var_trans(m, s, a, t) for t in range(self.reduced_sizes[m])] cnf = CardEnc.equals(lits, 1, vpool=self.vpool) self.solver.append_formula(cnf.clauses) def constrain_bisim(self): ss_init = [0 for m in range(self.components)] org_init = self.dfa['initial'] # we require initial states to be bisimilar self.solver.add_clause([self.var_bisim(ss_init, org_init)]) for org in range(self.dfa_size): for ss in itertools.product(range(self.size), repeat=self.components): # require intersection of components have the right acceptance # if bisim and all components accept => then original dfa accepts clause1 = [-self.var_bisim(ss, org)] + [-self.var_final(m, ss[m]) for m in range(self.components)] + [self.var_bool(org in self.dfa['final'])] self.solver.add_clause(clause1) # if bisim and original dfa accepts => then each component accepts for m in range(self.components): clause2 = [-self.var_bisim(ss, org)] + [-self.var_bool(org in self.dfa['final'])] + [self.var_final(m, ss[m])] self.solver.add_clause(clause2) # require transitions to bisimilar states for a in self.alphabet: org2 = self.dfa['trans'][(org, a)] for tt in itertools.product(range(self.size), repeat=self.components): clause = [-self.var_bisim(ss, org)] + [-self.var_trans(m, ss[m], a, tt[m]) for m in range(self.components)] + [self.var_bisim(tt, org2)] self.solver.add_clause(clause) def constraint(self): self.solver.add_clause([-self.var_bool(False)]) self.solver.add_clause([self.var_bool(True)]) for m in range(self.components): self.constrain_component(m) self.constrain_bisim() def solve(self): self.solver.solve() def get_model(self): lits = self.solver.get_model() model = set([lit for lit in lits if lit > 0]) components = {} for m in range(self.components): dfa = {'trans': {}, 'final': set(), 'initial': 0} for s in range(self.size): if self.var_final(m, s) in model: dfa['final'].add(s) for a in self.alphabet: for t in range(self.size): if self.var_trans(m, s, a, t) in model: dfa['trans'][(s, a)] = t components[m] = dfa return components def main(): encoder = DfaEncode() encoder.constraint() encoder.solver.solve() m = encoder.get_model() print(m) if __name__ == '__main__': main()