mirror of
https://git.cs.ou.nl/joshua.moerman/utf8-learner.git
synced 2025-07-01 14:17:45 +02:00
195 lines
5.7 KiB
Python
195 lines
5.7 KiB
Python
# Copyright 2024-2025 Joshua Moerman, Open Universiteit. All rights reserved
|
|
# SPDX-License-Identifier: EUPL-1.2
|
|
|
|
import itertools
|
|
from pysat.solvers import Solver
|
|
from pysat.card import CardEnc
|
|
from pysat.formula import IDPool
|
|
|
|
# Script to decompose a DFA as the intersection of smaller DFAs.
|
|
# As an example this is applied to the UTF-8 automaton, see
|
|
# https://joshuamoerman.nl/2025/6/The-UTF-8-Automaton.html
|
|
|
|
# Regular language of UTF8 sequences (9 states including a sink state).
|
|
# The model used here is a complete DFA, bytes are mapped to classes
|
|
# as explained here: https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
|
|
|
|
# Possible improvements:
|
|
# - Allow partial models. The current smallest decomposition is as 6+4.
|
|
# Coincidentally, both components have a sink state and we get a
|
|
# decomposition of size 5+3 which is quite good. Encoding partiallity
|
|
# directly will always find a minimal decomposition.
|
|
# - Get rid of input "8", since it is never accepted in a UTF8 string.
|
|
# - Only use reduced_sizes and not size. This will reduce the number of
|
|
# variables and constraints.
|
|
# - Write a loop to search for smallest decompositions, this is now
|
|
# done by hand.
|
|
|
|
# With 2 components, decomposable as:
|
|
# 5+5 NO
|
|
# 6+4 YES
|
|
# 7+3 YES
|
|
# 8+2 NO
|
|
# and everything below: NO
|
|
|
|
# With 3 components, decomposable as:
|
|
# 4+4+4 NO
|
|
# 5+4+3 YES
|
|
# 5+5+2 ???
|
|
# 6+3+3 YES
|
|
# 6+4+2 YES (see above)
|
|
# 7+3+2 YES (see above)
|
|
# 8+2+2 ???
|
|
# any x+y+z such that the sum is 11 or less: NO
|
|
# any x+y+z such that the sum is 13 or more: YES
|
|
def exampleDfa():
|
|
initial = 0
|
|
final = set([0])
|
|
alphabet = list(range(12))
|
|
states = list(range(9))
|
|
trans = {}
|
|
|
|
trans[(0, 0)] = 0
|
|
trans[(0, 2)] = 2
|
|
trans[(0, 3)] = 3
|
|
trans[(0, 4)] = 5
|
|
trans[(0, 5)] = 8
|
|
trans[(0, 6)] = 7
|
|
trans[(0, 10)] = 4
|
|
trans[(0, 11)] = 6
|
|
trans[(2, 1)] = 0
|
|
trans[(2, 7)] = 0
|
|
trans[(2, 9)] = 0
|
|
trans[(3, 1)] = 2
|
|
trans[(3, 7)] = 2
|
|
trans[(3, 9)] = 2
|
|
trans[(4, 7)] = 2
|
|
trans[(5, 1)] = 2
|
|
trans[(5, 9)] = 2
|
|
trans[(6, 7)] = 3
|
|
trans[(6, 9)] = 3
|
|
trans[(7, 1)] = 3
|
|
trans[(7, 7)] = 3
|
|
trans[(7, 9)] = 3
|
|
trans[(8, 1)] = 3
|
|
|
|
assert len(trans) == 23
|
|
|
|
for s in states:
|
|
for a in alphabet:
|
|
if (s,a) not in trans:
|
|
trans[(s,a)] = 1
|
|
return {'initial': initial, 'final': final, 'trans': trans}
|
|
|
|
|
|
class DfaEncode:
|
|
def __init__(self):
|
|
self.components = 3
|
|
self.reduced_sizes = {0: 7, 1: 2, 2: 2}
|
|
print(f'SIZES = {self.reduced_sizes}')
|
|
self.size = max(self.reduced_sizes.values())
|
|
self.alphabet = list(range(12))
|
|
self.dfa = exampleDfa()
|
|
self.dfa_size = 9
|
|
|
|
self.vpool = IDPool()
|
|
self.solver = Solver()
|
|
|
|
def var_bool(self, b):
|
|
return self.vpool.id(('bool', b))
|
|
|
|
def var_trans(self, m, s, a, t):
|
|
assert 0 <= m and m < self.components
|
|
assert 0 <= s and s < self.size
|
|
assert a in self.alphabet
|
|
assert 0 <= t and t < self.size
|
|
|
|
return self.vpool.id(('trans', m, s, a, t))
|
|
|
|
def var_final(self, m, s):
|
|
assert 0 <= m and m < self.components
|
|
assert 0 <= s and s < self.size
|
|
|
|
return self.vpool.id(('final', m, s))
|
|
|
|
def var_bisim(self, ss, org):
|
|
assert len(ss) == self.components
|
|
|
|
return self.vpool.id(('bisim', tuple(ss), org))
|
|
|
|
def constrain_component(self, m):
|
|
for s in range(self.size):
|
|
for a in self.alphabet:
|
|
lits = [self.var_trans(m, s, a, t) for t in range(self.reduced_sizes[m])]
|
|
cnf = CardEnc.equals(lits, 1, vpool=self.vpool)
|
|
self.solver.append_formula(cnf.clauses)
|
|
|
|
def constrain_bisim(self):
|
|
ss_init = [0 for m in range(self.components)]
|
|
org_init = self.dfa['initial']
|
|
|
|
# we require initial states to be bisimilar
|
|
self.solver.add_clause([self.var_bisim(ss_init, org_init)])
|
|
|
|
for org in range(self.dfa_size):
|
|
for ss in itertools.product(range(self.size), repeat=self.components):
|
|
# require intersection of components have the right acceptance
|
|
# if bisim and all components accept => then original dfa accepts
|
|
clause1 = [-self.var_bisim(ss, org)] + [-self.var_final(m, ss[m]) for m in range(self.components)] + [self.var_bool(org in self.dfa['final'])]
|
|
self.solver.add_clause(clause1)
|
|
# if bisim and original dfa accepts => then each component accepts
|
|
for m in range(self.components):
|
|
clause2 = [-self.var_bisim(ss, org)] + [-self.var_bool(org in self.dfa['final'])] + [self.var_final(m, ss[m])]
|
|
self.solver.add_clause(clause2)
|
|
|
|
# require transitions to bisimilar states
|
|
for a in self.alphabet:
|
|
org2 = self.dfa['trans'][(org, a)]
|
|
|
|
for tt in itertools.product(range(self.size), repeat=self.components):
|
|
clause = [-self.var_bisim(ss, org)] + [-self.var_trans(m, ss[m], a, tt[m]) for m in range(self.components)] + [self.var_bisim(tt, org2)]
|
|
self.solver.add_clause(clause)
|
|
|
|
def constraint(self):
|
|
self.solver.add_clause([-self.var_bool(False)])
|
|
self.solver.add_clause([self.var_bool(True)])
|
|
|
|
for m in range(self.components):
|
|
self.constrain_component(m)
|
|
|
|
self.constrain_bisim()
|
|
|
|
def solve(self):
|
|
self.solver.solve()
|
|
|
|
def get_model(self):
|
|
lits = self.solver.get_model()
|
|
model = set([lit for lit in lits if lit > 0])
|
|
|
|
components = {}
|
|
for m in range(self.components):
|
|
dfa = {'trans': {}, 'final': set(), 'initial': 0}
|
|
for s in range(self.size):
|
|
if self.var_final(m, s) in model:
|
|
dfa['final'].add(s)
|
|
|
|
for a in self.alphabet:
|
|
for t in range(self.size):
|
|
if self.var_trans(m, s, a, t) in model:
|
|
dfa['trans'][(s, a)] = t
|
|
|
|
components[m] = dfa
|
|
|
|
return components
|
|
|
|
|
|
def main():
|
|
encoder = DfaEncode()
|
|
encoder.constraint()
|
|
encoder.solver.solve()
|
|
m = encoder.get_model()
|
|
print(m)
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|