1
Fork 0
mirror of https://git.cs.ou.nl/joshua.moerman/utf8-learner.git synced 2025-07-01 14:17:45 +02:00
utf8-learner/src/main/java/nl/ou/utf8learner/UTF8SULICU4J.java
2025-06-13 13:21:37 +02:00

30 lines
870 B
Java

/*
* Copyright (c) 2025 Joshua Moerman, Open Universiteit
* SPDX-License-Identifier: EUPL-1.2
*/
package nl.ou.utf8learner;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
/**
 * Wraps ICU4J's {@link CharsetDetector} as a yes/no acceptor for byte
 * sequences, answering whether the detector is fully confident that the
 * input is UTF-8.
 */
public class UTF8SULICU4J {
    /**
     * Runs ICU4J charset detection on {@code data} and reports whether any
     * of the returned matches is named "UTF-8" (compared case-insensitively)
     * with a confidence of at least 100.
     *
     * <p>Caveat: the detector is a heuristic, not a strict validator. It
     * tolerates a certain amount of malformed input, and short inputs are
     * not always reported as valid UTF-8.
     *
     * @param data the raw bytes handed to the detector
     * @return {@code true} if a UTF-8 match with confidence >= 100 is among
     *         the detector's results, {@code false} otherwise
     */
    public static boolean accepts(byte[] data) {
        CharsetDetector icuDetector = new CharsetDetector();
        icuDetector.setDeclaredEncoding("UTF-8");
        icuDetector.setText(data);

        CharsetMatch[] candidates = icuDetector.detectAll();
        for (int i = 0; i < candidates.length; i++) {
            if (isCertainUtf8(candidates[i])) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether a single detector result is a UTF-8 match at the highest
     * confidence level.
     */
    private static boolean isCertainUtf8(CharsetMatch candidate) {
        // Check the charset name first; the confidence is only consulted
        // for UTF-8 candidates.
        if (!"UTF-8".equalsIgnoreCase(candidate.getName())) {
            return false;
        }
        // The confidence reported for UTF-8 is one of 15, 25, 80 or 100,
        // so this threshold only lets the top level through.
        return candidate.getConfidence() >= 100;
    }
}