1
Fork 0
mirror of https://git.cs.ou.nl/joshua.moerman/utf8-learner.git synced 2025-07-01 14:17:45 +02:00

already did all the things :-)

This commit is contained in:
Joshua Moerman 2025-06-13 13:21:37 +02:00
parent 9a09a24df3
commit 65f891e731
36 changed files with 7873 additions and 0 deletions

5
dfa-decompose/.ruff.toml Normal file
View file

@ -0,0 +1,5 @@
indent-width = 2
line-length = 320
[format]
quote-style = "single"

20
dfa-decompose/README.md Normal file
View file

@ -0,0 +1,20 @@
dfa-decompose
=============
Given a DFA, try to find smaller DFAs such that the intersection of their
languages is exactly the language of the input DFA.
```
pip install -r requirements.txt
python main.py
```
## Copyright notice
(c) 2025 Joshua Moerman, Open Universiteit, licensed under the EUPL (European
Union Public License). If you want to use this code and find the license not
suitable for you, then please do get in touch.
```
SPDX-License-Identifier: EUPL-1.2
```

195
dfa-decompose/main.py Normal file
View file

@ -0,0 +1,195 @@
# Copyright 2024-2025 Joshua Moerman, Open Universiteit. All rights reserved
# SPDX-License-Identifier: EUPL-1.2
import itertools
from pysat.solvers import Solver
from pysat.card import CardEnc
from pysat.formula import IDPool
# Script to decompose a DFA as the intersection of smaller DFAs.
# As an example this is applied to the UTF-8 automaton, see
# https://joshuamoerman.nl/2025/6/The-UTF-8-Automaton.html
# Regular language of UTF8 sequences (9 states including a sink state).
# The model used here is a complete DFA, bytes are mapped to classes
# as explained here: https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
# Possible improvements:
# - Allow partial models. The smallest decomposition currently found is 6+4.
# Coincidentally, both components have a sink state, so as partial DFAs we
# get a decomposition of size 5+3, which is quite good. Encoding partiality
# directly would always find a minimal decomposition.
# - Get rid of input "8", since it is never accepted in a UTF8 string.
# - Only use reduced_sizes and not size. This will reduce the number of
# variables and constraints.
# - Write a loop to search for smallest decompositions, this is now
# done by hand.
# With 2 components, decomposable as:
# 5+5 NO
# 6+4 YES
# 7+3 YES
# 8+2 NO
# and everything below: NO
# With 3 components, decomposable as:
# 4+4+4 NO
# 5+4+3 YES
# 5+5+2 ???
# 6+3+3 YES
# 6+4+2 YES (see above)
# 7+3+2 YES (see above)
# 8+2+2 ???
# any x+y+z such that the sum is 11 or less: NO
# any x+y+z such that the sum is 13 or more: YES
def exampleDfa():
  """Return the complete 9-state DFA over 12 byte classes used as the example.

  The result is a dict with keys 'initial' (state 0), 'final' (the set {0})
  and 'trans', a total map from (state, symbol) to state for states 0..8 and
  symbols 0..11. Every pair that is not listed explicitly below is sent to
  state 1; state 1 has no explicit entries, so it only loops to itself.
  """
  # The (state, symbol) -> state moves that do not lead to state 1.
  trans = {
    (0, 0): 0,
    (0, 2): 2,
    (0, 3): 3,
    (0, 4): 5,
    (0, 5): 8,
    (0, 6): 7,
    (0, 10): 4,
    (0, 11): 6,
    (2, 1): 0,
    (2, 7): 0,
    (2, 9): 0,
    (3, 1): 2,
    (3, 7): 2,
    (3, 9): 2,
    (4, 7): 2,
    (5, 1): 2,
    (5, 9): 2,
    (6, 7): 3,
    (6, 9): 3,
    (7, 1): 3,
    (7, 7): 3,
    (7, 9): 3,
    (8, 1): 3,
  }
  # Guard against accidentally dropping or duplicating an entry above.
  assert len(trans) == 23

  # Complete the automaton: every remaining pair goes to state 1.
  for key in itertools.product(range(9), range(12)):
    trans.setdefault(key, 1)

  return {'initial': 0, 'final': {0}, 'trans': trans}
class DfaEncode:
  """SAT encoding of "the example DFA is an intersection of smaller DFAs".

  Each of the `components` unknown DFAs is described by boolean variables for
  its transitions and its final states. A further family of variables,
  `bisim(ss, org)`, relates a tuple `ss` of component states (one state per
  component) to a state `org` of the original DFA. The clauses require that
  the initial states are related, that related states agree on acceptance
  (the tuple accepts iff every component accepts) and that the relation is
  preserved by every transition.

  All components are encoded with `size` states, the maximum of the requested
  sizes; a component that should be smaller only gets transitions *into* its
  first `reduced_sizes[m]` states, so the remaining states are padding.
  """

  def __init__(self, reduced_sizes=None):
    """Create the variable pool and solver for the requested component sizes.

    reduced_sizes: optional mapping {component index: number of states},
      keyed by 0, 1, ..., k-1. Defaults to {0: 7, 1: 2, 2: 2}, the
      configuration that used to be hard-coded here.

    Raises ValueError if the mapping is empty or not keyed by 0..k-1.
    """
    if reduced_sizes is None:
      reduced_sizes = {0: 7, 1: 2, 2: 2}
    self.reduced_sizes = dict(reduced_sizes)
    self.components = len(self.reduced_sizes)
    if not self.reduced_sizes or sorted(self.reduced_sizes) != list(range(self.components)):
      raise ValueError('reduced_sizes must be a non-empty mapping keyed by 0, 1, ..., k-1')
    print(f'SIZES = {self.reduced_sizes}')
    self.size = max(self.reduced_sizes.values())
    self.alphabet = list(range(12))
    self.dfa = exampleDfa()
    self.dfa_size = 9
    self.vpool = IDPool()
    self.solver = Solver()

  def var_bool(self, b):
    """Variable standing for the constant `b` (pinned down in `constraint`)."""
    return self.vpool.id(('bool', b))

  def var_trans(self, m, s, a, t):
    """Variable "component m moves from state s to state t on symbol a"."""
    assert 0 <= m < self.components
    assert 0 <= s < self.size
    assert a in self.alphabet
    assert 0 <= t < self.size
    return self.vpool.id(('trans', m, s, a, t))

  def var_final(self, m, s):
    """Variable "state s of component m is accepting"."""
    assert 0 <= m < self.components
    assert 0 <= s < self.size
    return self.vpool.id(('final', m, s))

  def var_bisim(self, ss, org):
    """Variable "component state tuple ss is related to original state org"."""
    assert len(ss) == self.components
    return self.vpool.id(('bisim', tuple(ss), org))

  def constrain_component(self, m):
    """Make component m deterministic and complete within its reduced size.

    For every state (padding states included) and every symbol, exactly one
    of the transition variables into the first `reduced_sizes[m]` states is
    true.
    """
    for s in range(self.size):
      for a in self.alphabet:
        lits = [self.var_trans(m, s, a, t) for t in range(self.reduced_sizes[m])]
        cnf = CardEnc.equals(lits, 1, vpool=self.vpool)
        self.solver.append_formula(cnf.clauses)

  def constrain_bisim(self):
    """Add the clauses tying the product of the components to the original."""
    ss_init = [0 for m in range(self.components)]
    org_init = self.dfa['initial']

    # we require initial states to be bisimilar
    self.solver.add_clause([self.var_bisim(ss_init, org_init)])

    for org in range(self.dfa_size):
      for ss in itertools.product(range(self.size), repeat=self.components):
        # require intersection of components have the right acceptance
        # if bisim and all components accept => then original dfa accepts
        clause1 = [-self.var_bisim(ss, org)] + [-self.var_final(m, ss[m]) for m in range(self.components)] + [self.var_bool(org in self.dfa['final'])]
        self.solver.add_clause(clause1)

        # if bisim and original dfa accepts => then each component accepts
        for m in range(self.components):
          clause2 = [-self.var_bisim(ss, org)] + [-self.var_bool(org in self.dfa['final'])] + [self.var_final(m, ss[m])]
          self.solver.add_clause(clause2)

        # require transitions to bisimilar states
        for a in self.alphabet:
          org2 = self.dfa['trans'][(org, a)]
          for tt in itertools.product(range(self.size), repeat=self.components):
            clause = [-self.var_bisim(ss, org)] + [-self.var_trans(m, ss[m], a, tt[m]) for m in range(self.components)] + [self.var_bisim(tt, org2)]
            self.solver.add_clause(clause)

  def constraint(self):
    """Add all clauses of the encoding to the solver."""
    # Pin the two constant variables so var_bool(x) really means x.
    self.solver.add_clause([-self.var_bool(False)])
    self.solver.add_clause([self.var_bool(True)])

    for m in range(self.components):
      self.constrain_component(m)

    self.constrain_bisim()

  def solve(self):
    """Run the solver and return its verdict (truthy iff satisfiable).

    The result used to be discarded, which left callers no way to tell a
    found decomposition from an unsatisfiable instance.
    """
    return self.solver.solve()

  def get_model(self):
    """Decode the solver's model into one DFA dict per component.

    Returns a dict {m: {'trans': ..., 'final': ..., 'initial': 0}}, or None
    when the solver has no model (for instance because the instance is
    unsatisfiable); previously that case crashed on iterating None.
    """
    lits = self.solver.get_model()
    if lits is None:
      return None
    model = {lit for lit in lits if lit > 0}

    components = {}
    for m in range(self.components):
      dfa = {'trans': {}, 'final': set(), 'initial': 0}
      for s in range(self.size):
        if self.var_final(m, s) in model:
          dfa['final'].add(s)
        for a in self.alphabet:
          # Only targets inside the reduced size are constrained to be
          # "exactly one". Variables for transitions into padding states are
          # free, so reading them could report a component that is larger
          # than requested.
          for t in range(self.reduced_sizes[m]):
            if self.var_trans(m, s, a, t) in model:
              dfa['trans'][(s, a)] = t
      components[m] = dfa

    return components
def main():
  """Build the encoding, solve it and print the decomposition, if any."""
  encoder = DfaEncode()
  encoder.constraint()

  # Check the solver's verdict before asking for a model: for an
  # unsatisfiable instance there is no model to decode, and the size
  # combinations marked NO in the notes at the top of this file are exactly
  # such instances.
  if not encoder.solver.solve():
    print(f'No decomposition found for sizes {encoder.reduced_sizes}')
    return

  m = encoder.get_model()
  print(m)


if __name__ == '__main__':
  main()

View file

@ -0,0 +1 @@
python-sat

View file

@ -0,0 +1,16 @@
{0:
{'trans':
{(0, 0): 0, (0, 1): 4, (0, 2): 2, (0, 3): 5, (0, 4): 5, (0, 5): 3, (0, 6): 1, (0, 7): 4, (0, 8): 4, (0, 9): 4, (0, 10): 3, (0, 11): 1, (1, 0): 4, (1, 1): 5, (1, 2): 4, (1, 3): 4, (1, 4): 4, (1, 5): 4, (1, 6): 4, (1, 7): 5, (1, 8): 4, (1, 9): 5, (1, 10): 4, (1, 11): 4, (2, 0): 4, (2, 1): 0, (2, 2): 4, (2, 3): 4, (2, 4): 4, (2, 5): 4, (2, 6): 4, (2, 7): 0, (2, 8): 1, (2, 9): 0, (2, 10): 4, (2, 11): 4, (3, 0): 4, (3, 1): 5, (3, 2): 4, (3, 3): 4, (3, 4): 4, (3, 5): 4, (3, 6): 4, (3, 7): 2, (3, 8): 4, (3, 9): 4, (3, 10): 4, (3, 11): 4, (4, 0): 4, (4, 1): 4, (4, 2): 4, (4, 3): 4, (4, 4): 4, (4, 5): 4, (4, 6): 4, (4, 7): 4, (4, 8): 4, (4, 9): 4, (4, 10): 4, (4, 11): 4, (5, 0): 4, (5, 1): 2, (5, 2): 4, (5, 3): 4, (5, 4): 4, (5, 5): 4, (5, 6): 4, (5, 7): 2, (5, 8): 4, (5, 9): 2, (5, 10): 4, (5, 11): 4},
'final':
{0},
'initial':
0},
1:
{'trans':
{(0, 0): 1, (0, 1): 1, (0, 2): 1, (0, 3): 1, (0, 4): 0, (0, 5): 0, (0, 6): 1, (0, 7): 2, (0, 8): 1, (0, 9): 1, (0, 10): 3, (0, 11): 3, (1, 0): 1, (1, 1): 1, (1, 2): 1, (1, 3): 1, (1, 4): 0, (1, 5): 0, (1, 6): 1, (1, 7): 1, (1, 8): 2, (1, 9): 1, (1, 10): 3, (1, 11): 3, (2, 0): 2, (2, 1): 2, (2, 2): 2, (2, 3): 2, (2, 4): 2, (2, 5): 2, (2, 6): 2, (2, 7): 2, (2, 8): 2, (2, 9): 2, (2, 10): 2, (2, 11): 2, (3, 0): 3, (3, 1): 2, (3, 2): 0, (3, 3): 2, (3, 4): 2, (3, 5): 3, (3, 6): 1, (3, 7): 1, (3, 8): 3, (3, 9): 1, (3, 10): 0, (3, 11): 0, (4, 0): 3, (4, 1): 3, (4, 2): 3, (4, 3): 3, (4, 4): 3, (4, 5): 3, (4, 6): 3, (4, 7): 3, (4, 8): 3, (4, 9): 3, (4, 10): 3, (4, 11): 3, (5, 0): 3, (5, 1): 3, (5, 2): 3, (5, 3): 3, (5, 4): 3, (5, 5): 3, (5, 6): 3, (5, 7): 3, (5, 8): 3, (5, 9): 3, (5, 10): 3, (5, 11): 3},
'final':
{0, 1},
'initial':
0}
}

View file

@ -0,0 +1,10 @@
{
0: {
'trans': {(0, 0): 0, (0, 1): 1, (0, 2): 5, (0, 3): 6, (0, 4): 6, (0, 5): 4, (0, 6): 2, (0, 7): 1, (0, 8): 6, (0, 9): 1, (0, 10): 4, (0, 11): 3, (1, 0): 1, (1, 1): 1, (1, 2): 1, (1, 3): 1, (1, 4): 1, (1, 5): 1, (1, 6): 1, (1, 7): 1, (1, 8): 6, (1, 9): 1, (1, 10): 1, (1, 11): 1, (2, 0): 1, (2, 1): 6, (2, 2): 1, (2, 3): 1, (2, 4): 1, (2, 5): 1, (2, 6): 1, (2, 7): 6, (2, 8): 6, (2, 9): 6, (2, 10): 1, (2, 11): 1, (3, 0): 1, (3, 1): 1, (3, 2): 1, (3, 3): 1, (3, 4): 1, (3, 5): 1, (3, 6): 1, (3, 7): 6, (3, 8): 6, (3, 9): 6, (3, 10): 1, (3, 11): 1, (4, 0): 1, (4, 1): 6, (4, 2): 1, (4, 3): 1, (4, 4): 0, (4, 5): 1, (4, 6): 1, (4, 7): 5, (4, 8): 1, (4, 9): 1, (4, 10): 1, (4, 11): 1, (5, 0): 1, (5, 1): 0, (5, 2): 1, (5, 3): 1, (5, 4): 1, (5, 5): 1, (5, 6): 1, (5, 7): 0, (5, 8): 6, (5, 9): 0, (5, 10): 1, (5, 11): 1, (6, 0): 1, (6, 1): 5, (6, 2): 1, (6, 3): 1, (6, 4): 1, (6, 5): 1, (6, 6): 1, (6, 7): 5, (6, 8): 6, (6, 9): 5, (6, 10): 1, (6, 11): 1},
'final': {0},
'initial': 0},
1: {
'trans': {(0, 0): 0, (0, 1): 2, (0, 2): 2, (0, 3): 0, (0, 4): 1, (0, 5): 2, (0, 6): 2, (0, 7): 2, (0, 8): 2, (0, 9): 2, (0, 10): 0, (0, 11): 2, (1, 0): 1, (1, 1): 2, (1, 2): 0, (1, 3): 2, (1, 4): 2, (1, 5): 1, (1, 6): 0, (1, 7): 0, (1, 8): 2, (1, 9): 2, (1, 10): 1, (1, 11): 0, (2, 0): 2, (2, 1): 0, (2, 2): 0, (2, 3): 2, (2, 4): 2, (2, 5): 1, (2, 6): 0, (2, 7): 0, (2, 8): 2, (2, 9): 0, (2, 10): 1, (2, 11): 0, (3, 0): 2, (3, 1): 2, (3, 2): 2, (3, 3): 2, (3, 4): 2, (3, 5): 2, (3, 6): 2, (3, 7): 2, (3, 8): 2, (3, 9): 2, (3, 10): 2, (3, 11): 2, (4, 0): 2, (4, 1): 2, (4, 2): 2, (4, 3): 2, (4, 4): 2, (4, 5): 2, (4, 6): 2, (4, 7): 2, (4, 8): 2, (4, 9): 2, (4, 10): 2, (4, 11): 2, (5, 0): 2, (5, 1): 2, (5, 2): 2, (5, 3): 2, (5, 4): 2, (5, 5): 2, (5, 6): 2, (5, 7): 2, (5, 8): 2, (5, 9): 2, (5, 10): 2, (5, 11): 2, (6, 0): 2, (6, 1): 2, (6, 2): 2, (6, 3): 2, (6, 4): 2, (6, 5): 2, (6, 6): 2, (6, 7): 2, (6, 8): 2, (6, 9): 2, (6, 10): 2, (6, 11): 2},
'final': {0},
'initial': 0}
}

View file

@ -0,0 +1,16 @@
8,5+ YES
/ \
7,5 YES 8,4 YES
/ \ / \
6,5 YES 7,4 YES 8,3 YES
/ \ / \ / \
5,5 no 6,4 YES 7,3 YES 8,2 no
\ / \ / \ /
5,4 no 6,3 no 7,2 no
/ \ / \ /
4,4 no 5,3 no 6,2 no
\ / \ /
4,3 no 5,2 no
\ /
4-,2 no

View file

@ -0,0 +1,15 @@
digraph g {
s0 [shape="doublecircle" label="s0"];
s0 -> s0 [label="0x00-0x7F"]
s0 -> s2 [label="0xC2-0xDF"]
s0 -> s5 [label="0xE1-0xEF"]
s0 -> s3 [label="0xE0, 0xF4"]
s0 -> s1 [label="0xF0-0xF3"]
s1 -> s5 [label="0x80-0xBF"]
s2 -> s0 [label="0x80-0xBF"]
s3 -> s5 [label="0x80-0x8F"]
s3 -> s2 [label="0xA0-0xBF"]
s5 -> s2 [label="0x80-0xBF"]
__start0 [label="" shape="none" width="0" height="0"];
__start0 -> s0;
}

Binary file not shown.

View file

@ -0,0 +1,15 @@
digraph g {
s0 [shape="doublecircle" label="s0"];
s1 [shape="doublecircle" label="s1"];
s0 -> s1 [label="0x00-0x9F,\n0xC2-0xDF,\n0xE1-0xEC,\n0xEE-0xEF,\n0xF1-0xF3"]
s0 -> s0 [label="0xED, 0xF4"]
s0 -> s3 [label="0xE0, 0xF0"]
s1 -> s1 [label="0x00-0xBF, 0xC2-0xDF, 0xE1-0xEC, 0xEE-0xEF, 0xF1-0xF3"]
s1 -> s0 [label="0xED, 0xF4"]
s1 -> s3 [label="0xE0, 0xF0"]
s3 -> s3 [label="0x00-0x7F, 0xF4"]
s3 -> s0 [label="0xC2-0xE0, 0xF0"]
s3 -> s1 [label="0x90-0xBF, 0xF1-0xF3"]
__start0 [label="" shape="none" width="0" height="0"];
__start0 -> s0;
}

Binary file not shown.

View file

@ -0,0 +1,2 @@
SIZES = {0: 5, 1: 4, 2: 3}
{0: {'trans': {(0, 0): 0, (0, 1): 3, (0, 2): 2, (0, 3): 4, (0, 4): 4, (0, 5): 1, (0, 6): 1, (0, 7): 3, (0, 8): 2, (0, 9): 3, (0, 10): 4, (0, 11): 1, (1, 0): 3, (1, 1): 4, (1, 2): 3, (1, 3): 3, (1, 4): 3, (1, 5): 3, (1, 6): 3, (1, 7): 4, (1, 8): 2, (1, 9): 4, (1, 10): 0, (1, 11): 3, (2, 0): 3, (2, 1): 0, (2, 2): 3, (2, 3): 3, (2, 4): 3, (2, 5): 3, (2, 6): 3, (2, 7): 0, (2, 8): 2, (2, 9): 0, (2, 10): 0, (2, 11): 3, (3, 0): 3, (3, 1): 3, (3, 2): 3, (3, 3): 3, (3, 4): 3, (3, 5): 3, (3, 6): 3, (3, 7): 3, (3, 8): 2, (3, 9): 3, (3, 10): 0, (3, 11): 3, (4, 0): 3, (4, 1): 2, (4, 2): 3, (4, 3): 3, (4, 4): 3, (4, 5): 3, (4, 6): 3, (4, 7): 2, (4, 8): 2, (4, 9): 2, (4, 10): 0, (4, 11): 3}, 'final': {0}, 'initial': 0}, 1: {'trans': {(0, 0): 1, (0, 1): 3, (0, 2): 3, (0, 3): 3, (0, 4): 0, (0, 5): 0, (0, 6): 3, (0, 7): 2, (0, 8): 3, (0, 9): 3, (0, 10): 1, (0, 11): 1, (1, 0): 1, (1, 1): 2, (1, 2): 3, (1, 3): 3, (1, 4): 0, (1, 5): 0, (1, 6): 3, (1, 7): 3, (1, 8): 3, (1, 9): 3, (1, 10): 1, (1, 11): 1, (2, 0): 2, (2, 1): 2, (2, 2): 2, (2, 3): 2, (2, 4): 2, (2, 5): 2, (2, 6): 2, (2, 7): 2, (2, 8): 3, (2, 9): 2, (2, 10): 2, (2, 11): 2, (3, 0): 0, (3, 1): 3, (3, 2): 3, (3, 3): 3, (3, 4): 0, (3, 5): 0, (3, 6): 3, (3, 7): 3, (3, 8): 3, (3, 9): 3, (3, 10): 1, (3, 11): 1, (4, 0): 3, (4, 1): 3, (4, 2): 3, (4, 3): 3, (4, 4): 3, (4, 5): 3, (4, 6): 3, (4, 7): 3, (4, 8): 3, (4, 9): 3, (4, 10): 3, (4, 11): 3}, 'final': {0, 1, 3}, 'initial': 0}, 2: {'trans': {(0, 0): 0, (0, 1): 0, (0, 2): 0, (0, 3): 0, (0, 4): 0, (0, 5): 1, (0, 6): 0, (0, 7): 0, (0, 8): 2, (0, 9): 0, (0, 10): 1, (0, 11): 0, (1, 0): 2, (1, 1): 0, (1, 2): 2, (1, 3): 2, (1, 4): 2, (1, 5): 2, (1, 6): 2, (1, 7): 0, (1, 8): 2, (1, 9): 2, (1, 10): 2, (1, 11): 2, (2, 0): 2, (2, 1): 2, (2, 2): 2, (2, 3): 2, (2, 4): 2, (2, 5): 2, (2, 6): 2, (2, 7): 2, (2, 8): 2, (2, 9): 2, (2, 10): 2, (2, 11): 2, (3, 0): 2, (3, 1): 2, (3, 2): 2, (3, 3): 2, (3, 4): 2, (3, 5): 2, (3, 6): 2, (3, 7): 2, (3, 8): 2, (3, 9): 2, (3, 10): 2, (3, 11): 2, (4, 0): 2, (4, 
1): 2, (4, 2): 2, (4, 3): 2, (4, 4): 2, (4, 5): 2, (4, 6): 2, (4, 7): 2, (4, 8): 2, (4, 9): 2, (4, 10): 2, (4, 11): 2}, 'final': {0}, 'initial': 0}}

View file

@ -0,0 +1,2 @@
SIZES = {0: 6, 1: 3, 2: 3}
{0: {'trans': {(0, 0): 0, (0, 1): 1, (0, 2): 2, (0, 3): 4, (0, 4): 4, (0, 5): 5, (0, 6): 3, (0, 7): 1, (0, 8): 5, (0, 9): 1, (0, 10): 5, (0, 11): 3, (1, 0): 1, (1, 1): 1, (1, 2): 1, (1, 3): 1, (1, 4): 1, (1, 5): 1, (1, 6): 1, (1, 7): 1, (1, 8): 5, (1, 9): 1, (1, 10): 1, (1, 11): 1, (2, 0): 1, (2, 1): 0, (2, 2): 1, (2, 3): 1, (2, 4): 1, (2, 5): 1, (2, 6): 1, (2, 7): 0, (2, 8): 5, (2, 9): 0, (2, 10): 1, (2, 11): 1, (3, 0): 1, (3, 1): 4, (3, 2): 1, (3, 3): 1, (3, 4): 1, (3, 5): 1, (3, 6): 1, (3, 7): 4, (3, 8): 5, (3, 9): 4, (3, 10): 1, (3, 11): 1, (4, 0): 1, (4, 1): 2, (4, 2): 1, (4, 3): 1, (4, 4): 1, (4, 5): 1, (4, 6): 1, (4, 7): 2, (4, 8): 5, (4, 9): 2, (4, 10): 1, (4, 11): 1, (5, 0): 1, (5, 1): 4, (5, 2): 1, (5, 3): 1, (5, 4): 1, (5, 5): 1, (5, 6): 1, (5, 7): 2, (5, 8): 5, (5, 9): 1, (5, 10): 1, (5, 11): 1}, 'final': {0}, 'initial': 0}, 1: {'trans': {(0, 0): 0, (0, 1): 1, (0, 2): 1, (0, 3): 2, (0, 4): 0, (0, 5): 1, (0, 6): 1, (0, 7): 2, (0, 8): 0, (0, 9): 1, (0, 10): 2, (0, 11): 1, (1, 0): 1, (1, 1): 2, (1, 2): 2, (1, 3): 1, (1, 4): 1, (1, 5): 0, (1, 6): 2, (1, 7): 2, (1, 8): 0, (1, 9): 2, (1, 10): 0, (1, 11): 2, (2, 0): 2, (2, 1): 1, (2, 2): 1, (2, 3): 2, (2, 4): 0, (2, 5): 1, (2, 6): 1, (2, 7): 1, (2, 8): 0, (2, 9): 1, (2, 10): 2, (2, 11): 1, (3, 0): 2, (3, 1): 2, (3, 2): 2, (3, 3): 2, (3, 4): 2, (3, 5): 2, (3, 6): 2, (3, 7): 2, (3, 8): 2, (3, 9): 2, (3, 10): 2, (3, 11): 2, (4, 0): 2, (4, 1): 2, (4, 2): 2, (4, 3): 2, (4, 4): 2, (4, 5): 2, (4, 6): 2, (4, 7): 2, (4, 8): 2, (4, 9): 2, (4, 10): 2, (4, 11): 2, (5, 0): 2, (5, 1): 2, (5, 2): 2, (5, 3): 2, (5, 4): 2, (5, 5): 2, (5, 6): 2, (5, 7): 2, (5, 8): 2, (5, 9): 2, (5, 10): 2, (5, 11): 2}, 'final': {0, 2}, 'initial': 0}, 2: {'trans': {(0, 0): 0, (0, 1): 2, (0, 2): 1, (0, 3): 1, (0, 4): 1, (0, 5): 1, (0, 6): 1, (0, 7): 1, (0, 8): 1, (0, 9): 1, (0, 10): 0, (0, 11): 0, (1, 0): 1, (1, 1): 1, (1, 2): 1, (1, 3): 1, (1, 4): 1, (1, 5): 1, (1, 6): 1, (1, 7): 1, (1, 8): 1, (1, 9): 1, (1, 10): 0, (1, 11): 0, (2, 0): 2, (2, 
1): 2, (2, 2): 2, (2, 3): 2, (2, 4): 2, (2, 5): 2, (2, 6): 2, (2, 7): 2, (2, 8): 1, (2, 9): 2, (2, 10): 2, (2, 11): 2, (3, 0): 2, (3, 1): 2, (3, 2): 2, (3, 3): 2, (3, 4): 2, (3, 5): 2, (3, 6): 2, (3, 7): 2, (3, 8): 2, (3, 9): 2, (3, 10): 2, (3, 11): 2, (4, 0): 2, (4, 1): 2, (4, 2): 2, (4, 3): 2, (4, 4): 2, (4, 5): 2, (4, 6): 2, (4, 7): 2, (4, 8): 2, (4, 9): 2, (4, 10): 2, (4, 11): 2, (5, 0): 2, (5, 1): 2, (5, 2): 2, (5, 3): 2, (5, 4): 2, (5, 5): 2, (5, 6): 2, (5, 7): 2, (5, 8): 2, (5, 9): 2, (5, 10): 2, (5, 11): 2}, 'final': {0, 1}, 'initial': 0}}

View file

@ -0,0 +1,17 @@
digraph g{
0 -> 0 [label="[0]"]
0 -> 1 [label="[5, 6, 11]"]
0 -> 2 [label="[2]"]
0 -> 3 [label="[1, 7, 9]"]
0 -> 4 [label="[3, 4, 10]"]
1 -> 0 [label="[10]"]
1 -> 3 [label="[0, 2, 3, 4, 5, 6, 11]"]
1 -> 4 [label="[1, 7, 9]"]
2 -> 0 [label="[1, 7, 9, 10]"]
2 -> 3 [label="[0, 2, 3, 4, 5, 6, 11]"]
3 -> 0 [label="[10]"]
3 -> 3 [label="[0, 1, 2, 3, 4, 5, 6, 7, 9, 11]"]
4 -> 0 [label="[10]"]
4 -> 2 [label="[1, 7, 9]"]
4 -> 3 [label="[0, 2, 3, 4, 5, 6, 11]"]
}

Binary file not shown.

View file

@ -0,0 +1,11 @@
digraph g{
0 -> 0 [label="[4, 5]"]
0 -> 1 [label="[0, 10, 11]"]
0 -> 3 [label="[1, 2, 3, 6, 9]"]
1 -> 0 [label="[4, 5]"]
1 -> 1 [label="[0, 10, 11]"]
1 -> 3 [label="[2, 3, 6, 7, 9]"]
3 -> 0 [label="[0, 4, 5]"]
3 -> 1 [label="[10, 11]"]
3 -> 3 [label="[1, 2, 3, 6, 7, 9]"]
}

Binary file not shown.

View file

@ -0,0 +1,5 @@
digraph g{
0 -> 0 [label="[0, 1, 2, 3, 4, 6, 7, 9, 11]"]
0 -> 1 [label="[5, 10]"]
1 -> 0 [label="[1, 7]"]
}

Binary file not shown.

View file

@ -0,0 +1,170 @@
digraph g {
t5x4x4 -> t6x4x4
t4x3x2 -> t4x4x2
t7x5x2 -> t7x5x3
t8x4x4 -> t8x5x4
t7x7x4 -> t7x7x5
t6x5x2 -> t6x6x2
t7x6x3 -> t7x6x4
t8x6x5 -> t8x7x5
t5x3x2 -> t6x3x2
t6x5x3 -> t6x5x4
t7x3x3 -> t8x3x3
t5x4x2 -> t5x5x2
t8x2x2 -> t8x3x2
t6x5x4 -> t7x5x4
t7x2x2 -> t7x3x2
t7x6x6 -> t7x7x6
t7x5x3 -> t7x6x3
t5x3x2 -> t5x3x3
t7x4x3 -> t8x4x3
t6x4x2 -> t6x4x3
t4x4x2 -> t5x4x2
t5x5x2 -> t6x5x2
t4x3x2 -> t4x3x3
t6x3x2 -> t6x4x2
t7x5x2 -> t8x5x2
t8x4x2 -> t8x5x2
t7x5x5 -> t7x6x5
t8x6x3 -> t8x7x3
t7x6x4 -> t7x6x5
t7x7x7 -> t8x7x7
t7x3x2 -> t8x3x2
t2x2x2 -> t3x2x2
t4x4x4 -> t5x4x4
t7x4x3 -> t7x4x4
t7x5x3 -> t8x5x3
t8x8x4 -> t8x8x5
t5x2x2 -> t5x3x2
t4x4x3 -> t4x4x4
t8x7x6 -> t8x8x6
t3x3x2 -> t4x3x2
t6x6x3 -> t7x6x3
t8x5x5 -> t8x6x5
t6x6x5 -> t6x6x6
t8x4x2 -> t8x4x3
t5x5x4 -> t5x5x5
t8x6x6 -> t8x7x6
t6x5x5 -> t6x6x5
t8x6x2 -> t8x6x3
t7x5x4 -> t8x5x4
t7x6x5 -> t8x6x5
t8x8x6 -> t8x8x7
t7x5x3 -> t7x5x4
t6x5x4 -> t6x5x5
t8x5x4 -> t8x5x5
t7x6x2 -> t7x6x3
t8x7x4 -> t8x8x4
t5x5x3 -> t5x5x4
t5x4x3 -> t5x5x3
t6x4x3 -> t6x4x4
t7x6x4 -> t8x6x4
t7x7x3 -> t7x7x4
t6x6x6 -> t7x6x6
t4x2x2 -> t5x2x2
t6x5x3 -> t6x6x3
t7x4x2 -> t7x4x3
t4x3x3 -> t4x4x3
t6x4x4 -> t6x5x4
t6x5x5 -> t7x5x5
t6x5x4 -> t6x6x4
t8x7x2 -> t8x8x2
t6x4x3 -> t6x5x3
t8x7x4 -> t8x7x5
t7x7x5 -> t7x7x6
t8x8x7 -> t8x8x8
t7x4x4 -> t7x5x4
t6x6x4 -> t7x6x4
t8x5x3 -> t8x5x4
t7x7x2 -> t8x7x2
t7x6x5 -> t7x7x5
t3x3x2 -> t3x3x3
t7x3x3 -> t7x4x3
t8x7x2 -> t8x7x3
t6x6x5 -> t7x6x5
t4x2x2 -> t4x3x2
t5x5x3 -> t6x5x3
t5x4x3 -> t5x4x4
t4x3x3 -> t5x3x3
t6x3x2 -> t6x3x3
t7x3x2 -> t7x3x3
t6x5x3 -> t7x5x3
t7x7x3 -> t8x7x3
t7x7x6 -> t8x7x6
t4x4x3 -> t5x4x3
t6x6x2 -> t7x6x2
t8x5x2 -> t8x5x3
t7x4x4 -> t8x4x4
t8x6x3 -> t8x6x4
t8x4x3 -> t8x5x3
t5x4x2 -> t6x4x2
t6x2x2 -> t7x2x2
t7x4x2 -> t8x4x2
t3x2x2 -> t4x2x2
t8x3x2 -> t8x4x2
t6x3x3 -> t6x4x3
t7x4x3 -> t7x5x3
t8x7x5 -> t8x7x6
t7x2x2 -> t8x2x2
t8x6x4 -> t8x6x5
t7x7x6 -> t7x7x7
t5x3x3 -> t6x3x3
t6x2x2 -> t6x3x2
t8x6x2 -> t8x7x2
t6x5x2 -> t6x5x3
t7x3x2 -> t7x4x2
t6x4x2 -> t7x4x2
t4x4x2 -> t4x4x3
t8x6x5 -> t8x6x6
t6x6x4 -> t6x6x5
t8x7x3 -> t8x8x3
t5x5x4 -> t6x5x4
t5x4x2 -> t5x4x3
t8x5x4 -> t8x6x4
t5x5x2 -> t5x5x3
t6x3x3 -> t7x3x3
t3x3x3 -> t4x3x3
t8x6x4 -> t8x7x4
t6x5x2 -> t7x5x2
t5x2x2 -> t6x2x2
t8x4x3 -> t8x4x4
t5x3x2 -> t5x4x2
t7x5x4 -> t7x6x4
t7x6x2 -> t8x6x2
t6x6x3 -> t6x6x4
t6x4x4 -> t7x4x4
t8x7x3 -> t8x7x4
t7x6x2 -> t7x7x2
t7x7x5 -> t8x7x5
t7x4x2 -> t7x5x2
t7x5x2 -> t7x6x2
t7x7x2 -> t7x7x3
t7x6x4 -> t7x7x4
t7x6x6 -> t8x6x6
t7x6x5 -> t7x6x6
t7x5x4 -> t7x5x5
t4x3x2 -> t5x3x2
t8x8x5 -> t8x8x6
t6x6x2 -> t6x6x3
t6x4x3 -> t7x4x3
t7x6x3 -> t7x7x3
t8x7x7 -> t8x8x7
t8x5x3 -> t8x6x3
t6x3x2 -> t7x3x2
t5x4x4 -> t5x5x4
t8x8x3 -> t8x8x4
t8x7x6 -> t8x7x7
t8x5x2 -> t8x6x2
t3x2x2 -> t3x3x2
t7x6x3 -> t8x6x3
t8x3x3 -> t8x4x3
t5x3x3 -> t5x4x3
t7x5x5 -> t8x5x5
t7x7x4 -> t8x7x4
t8x7x5 -> t8x8x5
t8x3x2 -> t8x3x3
t8x8x2 -> t8x8x3
t6x4x2 -> t6x5x2
t5x4x3 -> t6x4x3
t5x5x5 -> t6x5x5
}

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 48 KiB

71
pom.xml Normal file
View file

@ -0,0 +1,71 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>nl.ou.utf8learner</groupId>
<artifactId>utf8learner</artifactId>
<version>1.0.0-SNAPSHOT</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.release>11</maven.compiler.release>
</properties>
<dependencies>
<dependency>
<groupId>net.automatalib.distribution</groupId>
<artifactId>automata-distribution</artifactId>
<version>0.13.0-SNAPSHOT</version>
<type>pom</type>
</dependency>
<dependency>
<groupId>de.learnlib.distribution</groupId>
<artifactId>learnlib-distribution</artifactId>
<version>0.19.0-SNAPSHOT</version>
<type>pom</type>
</dependency>
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
<version>1.18.0</version>
</dependency>
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<version>77.1</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>3.6.0</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>nl.ou.utf8learner.Main</mainClass>
</transformer>
</transformers>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>module-info.class</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

2320
results/apache.dot Normal file

File diff suppressed because it is too large Load diff

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 5.9 KiB

30
results/full-partial.dot Normal file
View file

@ -0,0 +1,30 @@
digraph full {
s0 [shape="doublecircle" label="s0"];
s2 [shape="circle" label="s2"];
s3 [shape="circle" label="s3"];
s4 [shape="circle" label="s4"];
s5 [shape="circle" label="s5"];
s6 [shape="circle" label="s6"];
s7 [shape="circle" label="s7"];
s8 [shape="circle" label="s8"];
s0 -> s0 [label="0x00-0x7F"];
s0 -> s2 [label="0xC2-0xDF"];
s0 -> s4 [label="0xE0"];
s0 -> s3 [label="0xE1-0xEC, 0xEE-0xEF"];
s0 -> s5 [label="0xED"];
s0 -> s7 [label="0xF0"];
s0 -> s6 [label="0xF1-0xF3"];
s0 -> s8 [label="0xF4"];
s2 -> s0 [label="0x80-0xBF"];
s3 -> s2 [label="0x80-0xBF"];
s4 -> s2 [label="0xA0-0xBF"];
s5 -> s2 [label="0x80-0x9F"];
s6 -> s3 [label="0x80-0xBF"];
s7 -> s3 [label="0x90-0xBF"];
s8 -> s3 [label="0x80-0x8F"];
__start0 [label="" shape="none" width="0" height="0"];
__start0 -> s0;
}

BIN
results/full-partial.pdf Normal file

Binary file not shown.

49
results/full.dot Normal file
View file

@ -0,0 +1,49 @@
digraph full {
s0 [shape="doublecircle" label="s0"];
s1 [shape="circle" label="s1"];
s2 [shape="circle" label="s2"];
s3 [shape="circle" label="s3"];
s4 [shape="circle" label="s4"];
s5 [shape="circle" label="s5"];
s6 [shape="circle" label="s6"];
s7 [shape="circle" label="s7"];
s8 [shape="circle" label="s8"];
s0 -> s0 [label="0 -- 127"];
s0 -> s1 [label="-128 -- -63"];
s0 -> s2 [label="-62 -- -33"];
s0 -> s4 [label="-32"];
s0 -> s3 [label="-31 -- -20"];
s0 -> s5 [label="-19"];
s0 -> s3 [label="-18 -- -17"];
s0 -> s7 [label="-16"];
s0 -> s6 [label="-15 -- -13"];
s0 -> s8 [label="-12"];
s0 -> s1 [label="-11 -- -1"];
s1 -> s1 [label="0 -- -1"];
s2 -> s1 [label="0 -- 127"];
s2 -> s0 [label="-128 -- -65"];
s2 -> s1 [label="-64 -- -1"];
s3 -> s1 [label="0 -- 127"];
s3 -> s2 [label="-128 -- -65"];
s3 -> s1 [label="-64 -- -1"];
s4 -> s1 [label="0 -- -97"];
s4 -> s2 [label="-96 -- -65"];
s4 -> s1 [label="-64 -- -1"];
s5 -> s1 [label="0 -- 127"];
s5 -> s2 [label="-128 -- -97"];
s5 -> s1 [label="-96 -- -1"];
s6 -> s1 [label="0 -- 127"];
s6 -> s3 [label="-128 -- -65"];
s6 -> s1 [label="-64 -- -1"];
s7 -> s1 [label="0 -- -113"];
s7 -> s3 [label="-112 -- -65"];
s7 -> s1 [label="-64 -- -1"];
s8 -> s1 [label="0 -- 127"];
s8 -> s3 [label="-128 -- -113"];
s8 -> s1 [label="-112 -- -1"];
__start0 [label="" shape="none" width="0" height="0"];
__start0 -> s0;
}

BIN
results/full.pdf Normal file

Binary file not shown.

2320
results/guava.dot Normal file

File diff suppressed because it is too large Load diff

2320
results/java.dot Normal file

File diff suppressed because it is too large Load diff

7
run.sh Executable file
View file

@ -0,0 +1,7 @@
#!/usr/bin/env bash
# Build the project with Maven and run the learner, timing the run.
# -e: abort on the first failing command; -u: unset variables are errors;
# -x: echo each command before running it; -o pipefail: a pipeline fails if
# any of its stages fails.
set -euxo pipefail
# NOTE(review): the jar path below assumes this step produces
# target/utf8learner-1.0.0-SNAPSHOT.jar with all dependencies bundled.
mvn package
# Run the learner's entry point and report how long it took.
time java -cp target/utf8learner-1.0.0-SNAPSHOT.jar nl.ou.utf8learner.Main

View file

@ -0,0 +1,163 @@
/*
* Copyright (c) 2025 Joshua Moerman, Open Universiteit
* SPDX-License-Identifier: EUPL-1.2
*/
package nl.ou.utf8learner;
import java.io.IOException;
import java.util.Collection;
import org.checkerframework.checker.nullness.qual.Nullable;
import com.google.common.collect.Lists;
import de.learnlib.acex.AcexAnalyzers;
import de.learnlib.algorithm.LearningAlgorithm.DFALearner;
import de.learnlib.algorithm.ttt.dfa.TTTLearnerDFA;
import de.learnlib.oracle.EquivalenceOracle;
import de.learnlib.oracle.EquivalenceOracle.DFAEquivalenceOracle;
import de.learnlib.oracle.MembershipOracle.DFAMembershipOracle;
import de.learnlib.oracle.equivalence.DFAEQOracleChain;
import de.learnlib.oracle.equivalence.DFARandomWpMethodEQOracle;
import de.learnlib.oracle.equivalence.DFAWMethodEQOracle;
import de.learnlib.query.DefaultQuery;
import de.learnlib.query.Query;
import de.learnlib.util.Experiment.DFAExperiment;
import net.automatalib.alphabet.impl.AbstractAlphabet;
import net.automatalib.automaton.fsa.DFA;
import net.automatalib.serialization.dot.GraphDOT;
import net.automatalib.word.Word;
public class Main {
public static void main(String[] args) throws IOException {
// Basic set-up. Normally one would use a Cache, but in this case
// the queries are much faster than the lookup. So no cache here.
ByteAlphabet alph = new ByteAlphabet();
DFAMembershipOracle<Byte> mqOracle = new UTF8MembershipOracle();
// We run three equivalence oracles in sequence: from fast to exhaustive.
// It is really only the W-method in the end which makes a lot of queries.
// But we want at least some guarantee of completeness :-).
DFAEquivalenceOracle<Byte> fixedTestSuite = new DFASampleSetEQOracle<>(getTestSuite(), mqOracle);
DFAEquivalenceOracle<Byte> randomWpMethod = new DFARandomWpMethodEQOracle<>(mqOracle, 0, 8, 5000);
DFAEquivalenceOracle<Byte> wmethod = new DFAWMethodEQOracle<>(mqOracle, 1);
DFAEquivalenceOracle<Byte> eqOracle = new DFAEQOracleChain<>(fixedTestSuite, randomWpMethod, wmethod);
// TTT is always a good default
DFALearner<Byte> learner = new TTTLearnerDFA<>(alph, mqOracle, AcexAnalyzers.BINARY_SEARCH_BWD);
DFAExperiment<Byte> experiment = new DFAExperiment<>(learner, eqOracle, alph);
experiment.run();
// Output result!
System.err.println("");
System.out.println(experiment.getRounds().getSummary());
GraphDOT.write(experiment.getFinalHypothesis(), alph, System.out);
}
// Implementing the membership oracle directly. We obtain a word of bytes,
// and assemble them into a byte array, which is then passed to the
// underlying implementation.
public static class UTF8MembershipOracle implements DFAMembershipOracle<Byte> {
private long count = 0;
@Override
public void processQuery(Query<Byte, Boolean> query) {
count++;
System.err.print("\r" + count);
Word<Byte> input = query.getInput();
byte[] bytearray = new byte[input.length()];
int i = 0;
for (Byte b : input) {
bytearray[i] = b;
i++;
}
boolean acc = UTF8SUL.accepts(bytearray);
query.answer(acc);
}
@Override
public void processQueries(Collection<? extends Query<Byte, Boolean>> queries) {
for (Query<Byte, Boolean> query : queries) {
processQuery(query);
}
}
}
// Alphabet of all 256 bytes. Implemented directly for efficiency.
public static class ByteAlphabet extends AbstractAlphabet<Byte> {
@Override
public Byte getSymbol(int index) {
return (byte) index;
}
@Override
public int getSymbolIndex(Byte symbol) {
if (symbol >= 0) {
return (int) symbol;
} else {
return (int) (symbol + 256);
}
}
@Override
public boolean containsSymbol(Byte symbol) {
return true;
}
@Override
public int size() {
return 256;
}
}
// For some reason the SampleSetEQOracle of LearnLib was not easy to
// use with DFAs. So I made my own.
public static class DFASampleSetEQOracle<I> implements EquivalenceOracle.DFAEquivalenceOracle<I> {
final private Collection<Word<I>> testSuite;
final private DFAMembershipOracle<I> truth;
public DFASampleSetEQOracle(Collection<Word<I>> testSuite, DFAMembershipOracle<I> truth) {
this.testSuite = testSuite;
this.truth = truth;
}
@Override
public @Nullable DefaultQuery<I, Boolean> findCounterExample(DFA<?, I> hypothesis,
Collection<? extends I> alphabet) {
for (Word<I> test : testSuite) {
DefaultQuery<I, Boolean> query = new DefaultQuery<>(test);
truth.processQuery(query);
if (hypothesis.accepts(test) != query.getOutput()) {
return query;
}
}
return null;
}
}
// This should cover all states, I think.
public static Collection<Word<Byte>> getTestSuite() {
return Lists.newArrayList(
Word.fromSymbols((byte) -15, (byte) 64),
Word.fromSymbols((byte) -62, (byte) -65),
Word.fromSymbols((byte) -33, (byte) 127),
Word.fromSymbols((byte) -32, (byte) 0),
Word.fromSymbols((byte) -12, (byte) -112),
Word.fromSymbols((byte) -18, (byte) -100, (byte) 0),
Word.fromSymbols((byte) -32, (byte) -70, (byte) -120),
Word.fromSymbols((byte) -19, (byte) -97, (byte) -1),
Word.fromSymbols((byte) -31, (byte) -128, (byte) -100),
Word.fromSymbols((byte) -31, (byte) -128, (byte) 127),
Word.fromSymbols((byte) -16, (byte) -112, (byte) -64),
Word.fromSymbols((byte) -18, (byte) -100, (byte) -100),
Word.fromSymbols((byte) -19, (byte) -97, (byte) -90),
Word.fromSymbols((byte) 0xEF, (byte) 0xBB, (byte) 0xBF),
Word.fromSymbols((byte) -16, (byte) -112, (byte) -128, (byte) -128),
Word.fromSymbols((byte) -12, (byte) -120, (byte) -65, (byte) -80),
Word.fromSymbols((byte) -15, (byte) -65, (byte) -65, (byte) -65),
Word.fromSymbols((byte) -15, (byte) -128, (byte) -128, (byte) -128));
}
}

View file

@ -0,0 +1,27 @@
/*
* Copyright (c) 2025 Joshua Moerman, Open Universiteit
* SPDX-License-Identifier: EUPL-1.2
*/
package nl.ou.utf8learner;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.CharacterCodingException;
/** System under learning: the JDK's own UTF-8 decoder, used as a strict validator. */
public class UTF8SUL {

    /**
     * Returns true iff the JDK's UTF-8 decoder decodes {@code data} without
     * reporting an error.
     */
    public static boolean accepts(byte[] data) {
        // REPORT makes the decoder throw instead of silently substituting a
        // replacement character for bad input.
        CharsetDecoder strict = Charset.forName("UTF-8")
                .newDecoder()
                .onMalformedInput(CodingErrorAction.REPORT)
                .onUnmappableCharacter(CodingErrorAction.REPORT);

        boolean valid;
        try {
            strict.decode(ByteBuffer.wrap(data));
            valid = true;
        } catch (CharacterCodingException e) {
            valid = false;
        }
        return valid;
    }
}

View file

@ -0,0 +1,20 @@
/*
* Copyright (c) 2025 Joshua Moerman, Open Universiteit
* SPDX-License-Identifier: EUPL-1.2
*/
package nl.ou.utf8learner;
import java.util.Arrays;
import org.apache.commons.codec.binary.StringUtils;
/** System under learning built on Apache Commons Codec's {@code StringUtils}. */
public class UTF8SULApache {

    /**
     * Decodes {@code data} as UTF-8, encodes the resulting string again and
     * accepts iff the bytes come back unchanged. Any exception along the way
     * counts as a rejection.
     */
    public static boolean accepts(byte[] data) {
        try {
            String decoded = StringUtils.newStringUtf8(data);
            byte[] reencoded = StringUtils.getBytesUtf8(decoded);
            return Arrays.equals(data, reencoded);
        } catch (Exception e) {
            return false;
        }
    }
}

View file

@ -0,0 +1,14 @@
/*
* Copyright (c) 2025 Joshua Moerman, Open Universiteit
* SPDX-License-Identifier: EUPL-1.2
*/
package nl.ou.utf8learner;
import com.google.common.base.Utf8;
/** System under learning that delegates to Guava's {@code Utf8} utility. */
public class UTF8SULGuava {

    /** Returns whatever {@code Utf8.isWellFormed} reports for {@code data}. */
    public static boolean accepts(byte[] data) {
        boolean wellFormed = Utf8.isWellFormed(data);
        return wellFormed;
    }
}

View file

@ -0,0 +1,30 @@
/*
* Copyright (c) 2025 Joshua Moerman, Open Universiteit
* SPDX-License-Identifier: EUPL-1.2
*/
package nl.ou.utf8learner;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
/** System under learning built on ICU4J's charset detection. */
public class UTF8SULICU4J {

    /**
     * Accepts iff the detector lists UTF-8 among its matches with a confidence
     * of at least 100.
     *
     * Note that the CharsetDetector is not a good validator: it tolerates a
     * certain amount of errors, and it does not always report short strings
     * as valid UTF-8.
     */
    public static boolean accepts(byte[] data) {
        CharsetDetector detector = new CharsetDetector();
        detector.setDeclaredEncoding("UTF-8");
        detector.setText(data);

        for (CharsetMatch candidate : detector.detectAll()) {
            boolean isUtf8 = "UTF-8".equalsIgnoreCase(candidate.getName());
            // The confidence can be either 15, 25, 80 or 100.
            if (isUtf8 && candidate.getConfidence() >= 100) {
                return true;
            }
        }
        return false;
    }
}