mirror of
https://git.cs.ou.nl/joshua.moerman/utf8-learner.git
synced 2025-07-01 14:17:45 +02:00
30 lines
870 B
Java
30 lines
870 B
Java
/*
|
|
* Copyright (c) 2025 Joshua Moerman, Open Universiteit
|
|
* SPDX-License-Identifier: EUPL-1.2
|
|
*/
|
|
|
|
package nl.ou.utf8learner;
|
|
|
|
import com.ibm.icu.text.CharsetDetector;
|
|
import com.ibm.icu.text.CharsetMatch;
|
|
|
|
public class UTF8SULICU4J {
|
|
public static boolean accepts(byte[] data) {
|
|
// The CharsetDetector is not a good validator, it accepts a certain
|
|
// amount of errors. And it doesn't always report short strings as
|
|
// valid UTF-8.
|
|
CharsetDetector detector = new CharsetDetector();
|
|
detector.setDeclaredEncoding("UTF-8");
|
|
detector.setText(data);
|
|
|
|
CharsetMatch[] matches = detector.detectAll();
|
|
for (CharsetMatch match : matches) {
|
|
// The confidence can be either 15, 25, 80 or 100.
|
|
if ("UTF-8".equalsIgnoreCase(match.getName()) && match.getConfidence() >= 100) {
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
}
|