Improve support for non-BMP characters in XML.

enh-google · enh-google · commit 3f1a5ebc337e · 2014-10-18T13:29:39.000-07:00
This adds support for writing surrogate pairs out as numeric character
references (e.g. &#128586;) in KXmlSerializer and for parsing non-BMP
character references in DocumentBuilderImpl.

Emoji and XML. Two of my least favorite things together at last.

Bug: 17960630
Change-Id: If5e1001faf250e87e6eeebe3449a6ebc115789a1
diff --git a/luni/src/main/java/org/apache/harmony/xml/parsers/DocumentBuilderImpl.java b/luni/src/main/java/org/apache/harmony/xml/parsers/DocumentBuilderImpl.java
@@ -416,11 +416,13 @@ private String resolvePredefinedOrCharacterEntity(String entityName) {
 
     private String resolveCharacterReference(String value, int base) {
         try {
-            int ch = Integer.parseInt(value, base);
-            if (ch < 0 || ch > Character.MAX_VALUE) {
-                return null;
+            int codePoint = Integer.parseInt(value, base);
+            if (!Character.isValidCodePoint(codePoint)) {
+                // Negative or above U+10FFFF: toChars() would throw, so return null instead.
+                return null;
             }
-            return String.valueOf((char) ch);
+            // toChars() yields one char for a BMP code point, a surrogate pair otherwise.
+            return new String(Character.toChars(codePoint));
         } catch (NumberFormatException ex) {
             return null;
         }
diff --git a/luni/src/test/java/libcore/xml/KxmlSerializerTest.java b/luni/src/test/java/libcore/xml/KxmlSerializerTest.java
@@ -22,7 +22,9 @@
 import junit.framework.TestCase;
 import org.kxml2.io.KXmlSerializer;
 import org.w3c.dom.Document;
+import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
+import org.w3c.dom.Text;
 import org.xmlpull.v1.XmlSerializer;
 import static tests.support.Support_Xml.domOf;
 
@@ -87,12 +89,67 @@ private static XmlSerializer newSerializer() throws IOException {
         return serializer;
     }
 
+    // Builds the test string for one code point: a single char when the value fits in a
+    // char, otherwise the UTF-16 surrogate pair produced by Character.toChars().
+    public String fromCodePoint(int codePoint) {
+        return (codePoint <= Character.MAX_VALUE)
+                ? String.valueOf((char) codePoint)
+                : new String(Character.toChars(codePoint));
+    }
+
+    // http://b/17960630
+    // Writes U+1F64A (supplied as a UTF-16 surrogate pair) into an attribute value, a text
+    // node and a CDATA section, compares the serialized form with the expected markup, and
+    // then parses that output back to confirm the original strings are recovered.
+    public void testSpeakNoEvilMonkeys() throws Exception {
+        StringWriter out = new StringWriter();
+        XmlSerializer xml = new KXmlSerializer();
+        xml.setOutput(out);
+        xml.startDocument("UTF-8", null);
+        xml.startTag(NAMESPACE, "tag");
+        xml.attribute(NAMESPACE, "attr", "a\ud83d\ude4ab");
+        xml.text("c\ud83d\ude4ad");
+        xml.cdsect("e\ud83d\ude4af");
+        xml.endTag(NAMESPACE, "tag");
+        xml.endDocument();
+
+        // 128586 is 0x1F64A in decimal; the CDATA section is expected to be split around it.
+        String serialized = out.toString();
+        String expected = "<tag attr=\"a&#128586;b\">"
+                + "c&#128586;d"
+                + "<![CDATA[e]]>&#128586;<![CDATA[f]]>"
+                + "</tag>";
+        assertXmlEquals(expected, serialized);
+
+        // Check we can parse what we just output.
+        Node root = domOf(serialized).getDocumentElement();
+        Node attr = root.getAttributes().getNamedItem("attr");
+        assertEquals("a\ud83d\ude4ab", attr.getNodeValue());
+        // The expected value has the text and CDATA content together in the first child.
+        Text firstChild = (Text) root.getFirstChild();
+        assertEquals("c\ud83d\ude4ade\ud83d\ude4af", firstChild.getNodeValue());
+    }
+
+    // A high surrogate (U+D83D) followed by U+0040 ('@') instead of a low surrogate must be
+    // rejected with IllegalArgumentException in attribute values, text and CDATA sections.
+    public void testBadSurrogates() throws Exception {
+        StringWriter stringWriter = new StringWriter();
+        XmlSerializer serializer = new KXmlSerializer();
+        serializer.setOutput(stringWriter);
+        serializer.startDocument("UTF-8", null);
+        serializer.startTag(NAMESPACE, "tag");
+        try {
+            serializer.attribute(NAMESPACE, "attr", "a\ud83d\u0040b");
+            // Without this, the test would pass even if the bad pair were silently accepted.
+            fail("attribute() accepted a high surrogate without a low surrogate");
+        } catch (IllegalArgumentException expected) {
+        }
+        try {
+            serializer.text("c\ud83d\u0040d");
+            fail("text() accepted a high surrogate without a low surrogate");
+        } catch (IllegalArgumentException expected) {
+        }
+        try {
+            serializer.cdsect("e\ud83d\u0040f");
+            fail("cdsect() accepted a high surrogate without a low surrogate");
+        } catch (IllegalArgumentException expected) {
+        }
+    }
+
+    // Cover all the BMP code points plus a few that require us to use surrogates.
+    private static int MAX_TEST_CODE_POINT = 0x10008;
+
     public void testInvalidCharactersInText() throws IOException {
         XmlSerializer serializer = newSerializer();
         serializer.startTag(NAMESPACE, "root");
-        for (int ch = 0; ch <= 0xffff; ++ch) {
-            final String s = Character.toString((char) ch);
-            if (isValidXmlCodePoint(ch)) {
+        for (int c = 0; c <= MAX_TEST_CODE_POINT; ++c) {
+            final String s = fromCodePoint(c);
+            if (isValidXmlCodePoint(c)) {
                 serializer.text("a" + s + "b");
             } else {
                 try {
@@ -108,9 +165,9 @@ public void testInvalidCharactersInText() throws IOException {
     public void testInvalidCharactersInAttributeValues() throws IOException {
         XmlSerializer serializer = newSerializer();
         serializer.startTag(NAMESPACE, "root");
-        for (int ch = 0; ch <= 0xffff; ++ch) {
-            final String s = Character.toString((char) ch);
-            if (isValidXmlCodePoint(ch)) {
+        for (int c = 0; c <= MAX_TEST_CODE_POINT; ++c) {
+            final String s = fromCodePoint(c);
+            if (isValidXmlCodePoint(c)) {
                 serializer.attribute(NAMESPACE, "a", "a" + s + "b");
             } else {
                 try {
@@ -126,9 +183,9 @@ public void testInvalidCharactersInAttributeValues() throws IOException {
     public void testInvalidCharactersInCdataSections() throws IOException {
         XmlSerializer serializer = newSerializer();
         serializer.startTag(NAMESPACE, "root");
-        for (int ch = 0; ch <= 0xffff; ++ch) {
-            final String s = Character.toString((char) ch);
-            if (isValidXmlCodePoint(ch)) {
+        for (int c = 0; c <= MAX_TEST_CODE_POINT; ++c) {
+            final String s = fromCodePoint(c);
+            if (isValidXmlCodePoint(c)) {
                 serializer.cdsect("a" + s + "b");
             } else {
                 try {
diff --git a/xml/src/main/java/org/kxml2/io/KXmlSerializer.java b/xml/src/main/java/org/kxml2/io/KXmlSerializer.java
@@ -125,14 +125,18 @@ private final void writeEscaped(String s, int quot) throws IOException {
                     // otherwise generate.
                     // Note: tab, newline, and carriage return have already been
                     // handled above.
-                    boolean valid = (c >= 0x20 && c <= 0xd7ff) || (c >= 0xe000 && c <= 0xfffd);
-                    if (!valid) {
-                        reportInvalidCharacter(c);
-                    }
-                    if (unicode || c < 127) {
-                        writer.write(c);
+                    boolean allowedInXml = (c >= 0x20 && c <= 0xd7ff) || (c >= 0xe000 && c <= 0xfffd);
+                    if (allowedInXml) {
+                        if (unicode || c < 127) {
+                            writer.write(c);
+                        } else {
+                            writer.write("&#" + ((int) c) + ";");
+                        }
+                    } else if (Character.isHighSurrogate(c) && i < s.length() - 1) {
+                        writeSurrogate(c, s.charAt(i + 1));
+                        ++i;
                     } else {
-                        writer.write("&#" + ((int) c) + ";");
+                        reportInvalidCharacter(c);
                     }
                     // END android-changed
             }
@@ -141,7 +145,7 @@ private final void writeEscaped(String s, int quot) throws IOException {
 
     // BEGIN android-added
     private static void reportInvalidCharacter(char ch) {
-        throw new IllegalArgumentException("Illegal character (" + Integer.toHexString((int) ch) + ")");
+        throw new IllegalArgumentException("Illegal character (U+" + Integer.toHexString((int) ch) + ")");
     }
     // END android-added
 
@@ -548,22 +552,41 @@ public void cdsect(String data) throws IOException {
         // BEGIN android-changed: ]]> is not allowed within a CDATA,
         // so break and start a new one when necessary.
         data = data.replace("]]>", "]]]]><![CDATA[>");
-        char[] chars = data.toCharArray();
-        // We also aren't allowed any invalid characters.
-        for (char ch : chars) {
-            boolean valid = (ch >= 0x20 && ch <= 0xd7ff) ||
+        writer.write("<![CDATA[");
+        for (int i = 0; i < data.length(); ++i) {
+            char ch = data.charAt(i);
+            boolean allowedInCdata = (ch >= 0x20 && ch <= 0xd7ff) ||
                     (ch == '\t' || ch == '\n' || ch == '\r') ||
                     (ch >= 0xe000 && ch <= 0xfffd);
-            if (!valid) {
+            if (allowedInCdata) {
+                writer.write(ch);
+            } else if (Character.isHighSurrogate(ch) && i < data.length() - 1) {
+                // Character entities aren't valid in CDATA, so break out for this.
+                writer.write("]]>");
+                writeSurrogate(ch, data.charAt(++i));
+                writer.write("<![CDATA[");
+            } else {
                 reportInvalidCharacter(ch);
             }
         }
-        writer.write("<![CDATA[");
-        writer.write(chars, 0, chars.length);
         writer.write("]]>");
         // END android-changed
     }
 
+    // BEGIN android-added
+    /**
+     * Writes the code point formed by the UTF-16 pair {@code high}/{@code low} to
+     * {@code writer} as a decimal numeric character reference such as {@code &#128586;}.
+     * Only {@code low} is validated here; {@code high} is passed to
+     * {@link Character#toCodePoint} as given.
+     *
+     * @throws IllegalArgumentException if {@code low} is not a low surrogate
+     * @throws IOException if writing to {@code writer} fails
+     */
+    private void writeSurrogate(char high, char low) throws IOException {
+        if (Character.isLowSurrogate(low)) {
+            // Java-style surrogate pairs aren't allowed in XML. Emitting the 4-byte UTF-8
+            // form instead seems likely to upset readers expecting modified UTF-8, so the
+            // more conservative choice in a Java environment is a character reference.
+            writer.write("&#" + Character.toCodePoint(high, low) + ";");
+            return;
+        }
+        String highHex = Integer.toHexString(high);
+        String lowHex = Integer.toHexString(low);
+        throw new IllegalArgumentException(
+                "Bad surrogate pair (U+" + highHex + " U+" + lowHex + ")");
+    }
+    // END android-added
+
     public void comment(String comment) throws IOException {
         check(false);
         writer.write("<!--");