/* * Copyright (c) 2025 Joshua Moerman, Open Universiteit * SPDX-License-Identifier: EUPL-1.2 */ package nl.ou.utf8learner; import com.ibm.icu.text.CharsetDetector; import com.ibm.icu.text.CharsetMatch; public class UTF8SULICU4J { public static boolean accepts(byte[] data) { // The CharsetDetector is not a good validator, it accepts a certain // amount of errors. And it doesn't always report short strings as // valid UTF-8. CharsetDetector detector = new CharsetDetector(); detector.setDeclaredEncoding("UTF-8"); detector.setText(data); CharsetMatch[] matches = detector.detectAll(); for (CharsetMatch match : matches) { // The confidence can be either 15, 25, 80 or 100. if ("UTF-8".equalsIgnoreCase(match.getName()) && match.getConfidence() >= 100) { return true; } } return false; } }