utf8-learner/src/main/java/nl/ou/utf8learner/UTF8SULICU4J.java

/*
 * Copyright (c) 2025 Joshua Moerman, Open Universiteit
 * SPDX-License-Identifier: EUPL-1.2
 */

package nl.ou.utf8learner;

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;

/**
 * Answers "does ICU4J consider these bytes to be UTF-8?" by running a
 * {@link CharsetDetector} over the input and inspecting its matches.
 */
public class UTF8SULICU4J {
  /**
   * Runs ICU4J charset detection on {@code data} and reports whether any of
   * the returned matches is named UTF-8 (compared case-insensitively) with a
   * confidence of at least 100.
   *
   * <p>Caveat: the CharsetDetector is a detector, not a validator. It
   * tolerates a certain amount of errors, and it does not always report
   * short strings as valid UTF-8.
   *
   * @param data the raw bytes handed to the detector
   * @return {@code true} if a UTF-8 match with confidence 100 or higher is
   *     among the detector's results, {@code false} otherwise
   */
  public static boolean accepts(byte[] data) {
    CharsetDetector icuDetector = new CharsetDetector();
    icuDetector.setDeclaredEncoding("UTF-8");
    icuDetector.setText(data);

    CharsetMatch[] candidates = icuDetector.detectAll();
    for (int i = 0; i < candidates.length; i++) {
      CharsetMatch candidate = candidates[i];

      // Skip matches for any other charset.
      if (!"UTF-8".equalsIgnoreCase(candidate.getName())) {
        continue;
      }

      // The confidence can be either 15, 25, 80 or 100; only the top
      // value counts as acceptance.
      if (candidate.getConfidence() >= 100) {
        return true;
      }
    }

    return false;
  }
}