utf8-learner/dfa-decompose/main.py

#  Copyright 2024-2025 Joshua Moerman, Open Universiteit. All rights reserved
#  SPDX-License-Identifier: EUPL-1.2

import itertools
from pysat.solvers import Solver
from pysat.card import CardEnc
from pysat.formula import IDPool

# Script to decompose a DFA as the intersection of smaller DFAs.
# As an example this is applied to the UTF-8 automaton, see
# https://joshuamoerman.nl/2025/6/The-UTF-8-Automaton.html

# Regular language of UTF8 sequences (9 states including a sink state).
# The model used here is a complete DFA, bytes are mapped to classes
# as explained here: https://bjoern.hoehrmann.de/utf-8/decoder/dfa/

# Possible improvements:
# - Allow partial models. The current smallest decomposition is as 6+4.
#   Coincidentally, both components have a sink state and we get a
#   decomposition of size 5+3 which is quite good. Encoding partiallity
#   directly will always find a minimal decomposition.
# - Get rid of input "8", since it is never accepted in a UTF8 string.
# - Only use reduced_sizes and not size. This will reduce the number of
#   variables and constraints.
# - Write a loop to search for smallest decompositions, this is now
#   done by hand.

# With 2 components, decomposable as:
# 5+5 NO
# 6+4 YES
# 7+3 YES
# 8+2 NO
# and everything below: NO

# With 3 components, decomposable as:
# 4+4+4 NO
# 5+4+3 YES
# 5+5+2 ???
# 6+3+3 YES
# 6+4+2 YES (see above)
# 7+3+2 YES (see above)
# 8+2+2 ???
# any x+y+z such that the sum is 11 or less: NO
# any x+y+z such that the sum is 13 or more: YES
def exampleDfa():
  initial = 0
  final = set([0])
  alphabet = list(range(12))
  states = list(range(9))
  trans = {}

  trans[(0, 0)] = 0
  trans[(0, 2)] = 2
  trans[(0, 3)] = 3
  trans[(0, 4)] = 5
  trans[(0, 5)] = 8
  trans[(0, 6)] = 7
  trans[(0, 10)] = 4
  trans[(0, 11)] = 6
  trans[(2, 1)] = 0
  trans[(2, 7)] = 0
  trans[(2, 9)] = 0
  trans[(3, 1)] = 2
  trans[(3, 7)] = 2
  trans[(3, 9)] = 2
  trans[(4, 7)] = 2
  trans[(5, 1)] = 2
  trans[(5, 9)] = 2
  trans[(6, 7)] = 3
  trans[(6, 9)] = 3
  trans[(7, 1)] = 3
  trans[(7, 7)] = 3
  trans[(7, 9)] = 3
  trans[(8, 1)] = 3

  assert len(trans) == 23

  for s in states:
    for a in alphabet:
      if (s,a) not in trans:
        trans[(s,a)] = 1
  return {'initial': initial, 'final': final, 'trans': trans}


class DfaEncode:
  def __init__(self):
    self.components = 3
    self.reduced_sizes = {0: 7, 1: 2, 2: 2}
    print(f'SIZES = {self.reduced_sizes}')
    self.size = max(self.reduced_sizes.values())
    self.alphabet = list(range(12))
    self.dfa = exampleDfa()
    self.dfa_size = 9

    self.vpool = IDPool()
    self.solver = Solver()

  def var_bool(self, b):
    return self.vpool.id(('bool', b))

  def var_trans(self, m, s, a, t):
    assert 0 <= m and m < self.components
    assert 0 <= s and s < self.size
    assert a in self.alphabet
    assert 0 <= t and t < self.size

    return self.vpool.id(('trans', m, s, a, t))

  def var_final(self, m, s):
    assert 0 <= m and m < self.components
    assert 0 <= s and s < self.size

    return self.vpool.id(('final', m, s))

  def var_bisim(self, ss, org):
    assert len(ss) == self.components

    return self.vpool.id(('bisim', tuple(ss), org))

  def constrain_component(self, m):
    for s in range(self.size):
      for a in self.alphabet:
        lits = [self.var_trans(m, s, a, t) for t in range(self.reduced_sizes[m])]
        cnf = CardEnc.equals(lits, 1, vpool=self.vpool)
        self.solver.append_formula(cnf.clauses)

  def constrain_bisim(self):
    ss_init = [0 for m in range(self.components)]
    org_init = self.dfa['initial']

    # we require initial states to be bisimilar
    self.solver.add_clause([self.var_bisim(ss_init, org_init)])

    for org in range(self.dfa_size):
      for ss in itertools.product(range(self.size), repeat=self.components):
        # require intersection of components have the right acceptance
        # if bisim and all components accept => then original dfa accepts
        clause1 = [-self.var_bisim(ss, org)] + [-self.var_final(m, ss[m]) for m in range(self.components)] + [self.var_bool(org in self.dfa['final'])]
        self.solver.add_clause(clause1)
        # if bisim and original dfa accepts => then each component accepts
        for m in range(self.components):
          clause2 = [-self.var_bisim(ss, org)] + [-self.var_bool(org in self.dfa['final'])] + [self.var_final(m, ss[m])]
          self.solver.add_clause(clause2)

        # require transitions to bisimilar states
        for a in self.alphabet:
          org2 = self.dfa['trans'][(org, a)]

          for tt in itertools.product(range(self.size), repeat=self.components):
            clause = [-self.var_bisim(ss, org)] + [-self.var_trans(m, ss[m], a, tt[m]) for m in range(self.components)] + [self.var_bisim(tt, org2)]
            self.solver.add_clause(clause)

  def constraint(self):
    self.solver.add_clause([-self.var_bool(False)])
    self.solver.add_clause([self.var_bool(True)])

    for m in range(self.components):
      self.constrain_component(m)

    self.constrain_bisim()

  def solve(self):
    self.solver.solve()

  def get_model(self):
    lits = self.solver.get_model()
    model = set([lit for lit in lits if lit > 0])

    components = {}
    for m in range(self.components):
      dfa = {'trans': {}, 'final': set(), 'initial': 0}
      for s in range(self.size):
        if self.var_final(m, s) in model:
          dfa['final'].add(s)

        for a in self.alphabet:
          for t in range(self.size):
            if self.var_trans(m, s, a, t) in model:
              dfa['trans'][(s, a)] = t

      components[m] = dfa

    return components


def main():
  encoder = DfaEncode()
  encoder.constraint()
  encoder.solver.solve()
  m = encoder.get_model()
  print(m)


if __name__ == '__main__':
  main()