JDK 9学习笔记 - (2)能屈能伸的String



  1. jdk/src/java.base/share/native/libjava/String.c
  2. jdk/src/java.base/share/classes/java/lang/String.java
  3. jdk/src/java.base/share/classes/java/lang/StringLatin1.java
  4. jdk/src/java.base/share/classes/java/lang/StringUTF16.java


无论是何种语言的何种实现,String本质上都是字节序列。所有可能的字符加起来就构成了字符集,给字符集中的每个字符分配一个序号就是字符编码。使用最广泛的字符集就是Unicode了,它几乎支持地球上所有常见文字。Unicode有三种最主要的编码实现:UTF-8,UTF-16还有UTF-32,在web领域,UTF-8已经处于绝对垄断地位。Java 9的String,引入了类似Python str的压缩功能。原理很简单:如果String只包含Latin1字符,1字节存一个字符就够用了;如果String含有中文等Latin1之外的字符,那么就换用UTF-16编码存储,再用一个变量(coder)标记当前使用的编码就行了。先看三个类。

public final class String
    implements java.io.Serializable, Comparable<String>, CharSequence {

    /**  * The value is used for character storage.  *  * @implNote This field is trusted by the VM, and is a subject to  * constant folding if String instance is constant. Overwriting this  * field after construction will cause problems.  *  * Additionally, it is marked with {@link Stable} to trust the contents  * of the array. No other facility in JDK provides this functionality (yet).  * {@link Stable} is safe here, because value is never null.  */
    private final byte[] value;

    /**  * The identifier of the encoding used to encode the bytes in  * {@code value}. The supported values in this implementation are  *  * LATIN1  * UTF16  *  * @implNote This field is trusted by the VM, and is a subject to  * constant folding if String instance is constant. Overwriting this  * field after construction will cause problems.  */
    private final byte coder;

    /** Cache the hash code for the string */
    private int hash; // Default to 0 }

final class StringLatin1 {

final class StringUTF16 {








    public char charAt(int index) {
        if (isLatin1()) {
            return StringLatin1.charAt(value, index);
        } else {
            return StringUTF16.charAt(value, index);


    // Latin1     public static char charAt(byte[] value, int index) {
        if (index < 0 || index >= value.length) {
            throw new StringIndexOutOfBoundsException(index);
        return (char)(value[index] & 0xff);

    // UTF-16     public static char charAt(byte[] value, int index) {
        checkIndex(index, value);
        return getChar(value, index);

    static char getChar(byte[] val, int index) {
        assert index >= 0 && index < length(val) : "Trusted caller missed bounds check";
        index <<= 1;
        return (char)(((val[index++] & 0xff) << HI_BYTE_SHIFT) |
                      ((val[index]   & 0xff) << LO_BYTE_SHIFT));

    private static native boolean isBigEndian();

    static final int HI_BYTE_SHIFT;
    static final int LO_BYTE_SHIFT;
    static {
        if (isBigEndian()) {
            HI_BYTE_SHIFT = 8;
            LO_BYTE_SHIFT = 0;
        } else {
            HI_BYTE_SHIFT = 0;
            LO_BYTE_SHIFT = 8;




    // String     public int codePointAt(int index) {
        if (isLatin1()) {
            checkIndex(index, value.length);
            return value[index] & 0xff;
        int length = value.length >> 1;
        checkIndex(index, length);
        return StringUTF16.codePointAt(value, index, length);

    // UTF-16     public static int codePointAt(byte[] value, int index, int end) {
       return codePointAt(value, index, end, false /* unchecked */);

    // UTF-16     private static int codePointAt(byte[] value, int index, int end, boolean checked) {
        assert index < end;
        if (checked) {
            checkIndex(index, value);
        char c1 = getChar(value, index);
        if (Character.isHighSurrogate(c1) && ++index < end) {
            if (checked) {
                checkIndex(index, value);
            char c2 = getChar(value, index);
            if (Character.isLowSurrogate(c2)) {
               return Character.toCodePoint(c1, c2);
        return c1;





可能我们都很熟悉KMP或者自动机等高级字符串查找算法,然而在Java的String里,并没有用到这些高级算法,JDK String选择的是最朴素的暴力查找。这是因为现实中的字符串查找,很少出现大量前缀字符匹配但最后失配的情况。比如这篇博文,任意拿10个字符的子串出来查找,基本不会遇到前9个匹配但第10个不匹配的情况。但是这种朴素算法,可能会遭受到精心构造的数据的攻击,显著增加计算量,时间复杂度从O(m)退化成O(m*n),其中m是完整字符串的长度,n是要查找的子串的长度。在极端情况下,我们需要防范这种攻击,但平时一般不用考虑。


有了查找,自然也就有了替换,替换 = 查找 + 拼接。单个字符的替换比较简单,但是需要考虑Latin1变成UTF-16的情况,也要考虑UTF-16压缩回Latin1的情况。



A string literal is a reference to an instance of class String (§4.3.1, §4.3.3). Moreover, a string literal always refers to the same instance of class String. This is because string literals – or, more generally, strings that are the values of constant expressions (§15.28) – are “interned” so as to share unique instances, using the method String.intern.


JVM_ENTRY(jstring, JVM_InternString(JNIEnv *env, jstring str))
  JvmtiVMObjectAllocEventCollector oam;
  if (str == NULL) return NULL;
  oop string = JNIHandles::resolve_non_null(str);
  oop result = StringTable::intern(string, CHECK_NULL);
  return (jstring) JNIHandles::make_local(env, result);

oop StringTable::intern(oop string, TRAPS)
  if (string == NULL) return NULL;
  ResourceMark rm(THREAD);
  int length;
  Handle h_string (THREAD, string);
  jchar* chars = java_lang_String::as_unicode_string(string, length, CHECK_NULL);
  oop result = intern(h_string, chars, length, CHECK_NULL);
  return result;

oop StringTable::intern(Handle string_or_null, jchar* name,
                        int len, TRAPS) {
  // shared table always uses java_lang_String::hash_code   unsigned int hashValue = java_lang_String::hash_code(name, len);
  oop found_string = lookup_shared(name, len, hashValue);
  if (found_string != NULL) {
    return found_string;
  if (use_alternate_hashcode()) {
    hashValue = alt_hash_string(name, len);
  int index = the_table()->hash_to_index(hashValue);
  found_string = the_table()->lookup_in_main_table(index, name, len, hashValue);

  // Found   if (found_string != NULL) {
    if (found_string != string_or_null()) {
    return found_string;

  debug_only(StableMemoryChecker smc(name, len * sizeof(name[0])));
         "proposed name of symbol must be stable");

  Handle string;
  // try to reuse the string if possible   if (!string_or_null.is_null()) {
    string = string_or_null;
  } else {
    string = java_lang_String::create_from_unicode(name, len, CHECK_NULL);

#if INCLUDE_ALL_GCS   if (G1StringDedup::is_enabled()) {
    // Deduplicate the string before it is interned. Note that we should never     // deduplicate a string after it has been interned. Doing so will counteract     // compiler optimizations done on e.g. interned string literals.     G1StringDedup::deduplicate(string());
  // Grab the StringTable_lock before getting the_table() because it could   // change at safepoint.   oop added_or_found;
    MutexLocker ml(StringTable_lock, THREAD);
    // Otherwise, add to symbol to table     added_or_found = the_table()->basic_add(index, string, name, len,
                                  hashValue, CHECK_NULL);

  if (added_or_found != string()) {

  return added_or_found;



JDK 9 String 的 compress 设计,在以Latin1字符为主的程序里,可以把String占用的内存减少一半。小小的改进,体现到代码层面却是处处都要把这一点放在心上处理。天下没有免费的午餐,这个特性在节省内存的同时引入了编码检测的开销,有时反而会更慢。如果确信关闭这个特性更好,JVM 9提供了参数可以关闭它:-XX:-CompactStrings。

    原文地址: https://zhuanlan.zhihu.com/p/30584322