Skip to content

Commit 3f1a5eb

Browse files
committed
Improve support for non-BMP characters in XML.
This adds support for writing surrogate pairs out as numeric character references in KXmlSerializer and for parsing non-BMP numeric character references in DocumentBuilderImpl. Emoji and XML. Two of my least favorite things together at last. Bug: 17960630 Change-Id: If5e1001faf250e87e6eeebe3449a6ebc115789a1
1 parent 73f5c01 commit 3f1a5eb

File tree

3 files changed

+110
-28
lines changed

3 files changed

+110
-28
lines changed

luni/src/main/java/org/apache/harmony/xml/parsers/DocumentBuilderImpl.java

+6-4
Original file line numberDiff line numberDiff line change
@@ -416,11 +416,13 @@ private String resolvePredefinedOrCharacterEntity(String entityName) {
416416

417417
private String resolveCharacterReference(String value, int base) {
418418
try {
419-
int ch = Integer.parseInt(value, base);
420-
if (ch < 0 || ch > Character.MAX_VALUE) {
421-
return null;
419+
int codePoint = Integer.parseInt(value, base);
420+
if (Character.isBmpCodePoint(codePoint)) {
421+
return String.valueOf((char) codePoint);
422+
} else {
423+
char[] surrogatePair = Character.toChars(codePoint);
424+
return new String(surrogatePair);
422425
}
423-
return String.valueOf((char) ch);
424426
} catch (NumberFormatException ex) {
425427
return null;
426428
}

luni/src/test/java/libcore/xml/KxmlSerializerTest.java

+66-9
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,9 @@
2222
import junit.framework.TestCase;
2323
import org.kxml2.io.KXmlSerializer;
2424
import org.w3c.dom.Document;
25+
import org.w3c.dom.Node;
2526
import org.w3c.dom.NodeList;
27+
import org.w3c.dom.Text;
2628
import org.xmlpull.v1.XmlSerializer;
2729
import static tests.support.Support_Xml.domOf;
2830

@@ -87,12 +89,67 @@ private static XmlSerializer newSerializer() throws IOException {
8789
return serializer;
8890
}
8991

92+
public String fromCodePoint(int codePoint) {
93+
if (codePoint > Character.MAX_VALUE) {
94+
return new String(Character.toChars(codePoint));
95+
}
96+
return Character.toString((char) codePoint);
97+
}
98+
99+
// http://b/17960630
100+
public void testSpeakNoEvilMonkeys() throws Exception {
101+
StringWriter stringWriter = new StringWriter();
102+
XmlSerializer serializer = new KXmlSerializer();
103+
serializer.setOutput(stringWriter);
104+
serializer.startDocument("UTF-8", null);
105+
serializer.startTag(NAMESPACE, "tag");
106+
serializer.attribute(NAMESPACE, "attr", "a\ud83d\ude4ab");
107+
serializer.text("c\ud83d\ude4ad");
108+
serializer.cdsect("e\ud83d\ude4af");
109+
serializer.endTag(NAMESPACE, "tag");
110+
serializer.endDocument();
111+
assertXmlEquals("<tag attr=\"a&#128586;b\">" +
112+
"c&#128586;d" +
113+
"<![CDATA[e]]>&#128586;<![CDATA[f]]>" +
114+
"</tag>", stringWriter.toString());
115+
116+
// Check we can parse what we just output.
117+
Document doc = domOf(stringWriter.toString());
118+
Node root = doc.getDocumentElement();
119+
assertEquals("a\ud83d\ude4ab", root.getAttributes().getNamedItem("attr").getNodeValue());
120+
Text text = (Text) root.getFirstChild();
121+
assertEquals("c\ud83d\ude4ade\ud83d\ude4af", text.getNodeValue());
122+
}
123+
124+
public void testBadSurrogates() throws Exception {
125+
StringWriter stringWriter = new StringWriter();
126+
XmlSerializer serializer = new KXmlSerializer();
127+
serializer.setOutput(stringWriter);
128+
serializer.startDocument("UTF-8", null);
129+
serializer.startTag(NAMESPACE, "tag");
130+
try {
131+
serializer.attribute(NAMESPACE, "attr", "a\ud83d\u0040b");
132+
} catch (IllegalArgumentException expected) {
133+
}
134+
try {
135+
serializer.text("c\ud83d\u0040d");
136+
} catch (IllegalArgumentException expected) {
137+
}
138+
try {
139+
serializer.cdsect("e\ud83d\u0040f");
140+
} catch (IllegalArgumentException expected) {
141+
}
142+
}
143+
144+
// Cover all the BMP code points plus a few that require us to use surrogates.
145+
private static int MAX_TEST_CODE_POINT = 0x10008;
146+
90147
public void testInvalidCharactersInText() throws IOException {
91148
XmlSerializer serializer = newSerializer();
92149
serializer.startTag(NAMESPACE, "root");
93-
for (int ch = 0; ch <= 0xffff; ++ch) {
94-
final String s = Character.toString((char) ch);
95-
if (isValidXmlCodePoint(ch)) {
150+
for (int c = 0; c <= MAX_TEST_CODE_POINT; ++c) {
151+
final String s = fromCodePoint(c);
152+
if (isValidXmlCodePoint(c)) {
96153
serializer.text("a" + s + "b");
97154
} else {
98155
try {
@@ -108,9 +165,9 @@ public void testInvalidCharactersInText() throws IOException {
108165
public void testInvalidCharactersInAttributeValues() throws IOException {
109166
XmlSerializer serializer = newSerializer();
110167
serializer.startTag(NAMESPACE, "root");
111-
for (int ch = 0; ch <= 0xffff; ++ch) {
112-
final String s = Character.toString((char) ch);
113-
if (isValidXmlCodePoint(ch)) {
168+
for (int c = 0; c <= MAX_TEST_CODE_POINT; ++c) {
169+
final String s = fromCodePoint(c);
170+
if (isValidXmlCodePoint(c)) {
114171
serializer.attribute(NAMESPACE, "a", "a" + s + "b");
115172
} else {
116173
try {
@@ -126,9 +183,9 @@ public void testInvalidCharactersInAttributeValues() throws IOException {
126183
public void testInvalidCharactersInCdataSections() throws IOException {
127184
XmlSerializer serializer = newSerializer();
128185
serializer.startTag(NAMESPACE, "root");
129-
for (int ch = 0; ch <= 0xffff; ++ch) {
130-
final String s = Character.toString((char) ch);
131-
if (isValidXmlCodePoint(ch)) {
186+
for (int c = 0; c <= MAX_TEST_CODE_POINT; ++c) {
187+
final String s = fromCodePoint(c);
188+
if (isValidXmlCodePoint(c)) {
132189
serializer.cdsect("a" + s + "b");
133190
} else {
134191
try {

xml/src/main/java/org/kxml2/io/KXmlSerializer.java

+38-15
Original file line numberDiff line numberDiff line change
@@ -125,14 +125,18 @@ private final void writeEscaped(String s, int quot) throws IOException {
125125
// otherwise generate.
126126
// Note: tab, newline, and carriage return have already been
127127
// handled above.
128-
boolean valid = (c >= 0x20 && c <= 0xd7ff) || (c >= 0xe000 && c <= 0xfffd);
129-
if (!valid) {
130-
reportInvalidCharacter(c);
131-
}
132-
if (unicode || c < 127) {
133-
writer.write(c);
128+
boolean allowedInXml = (c >= 0x20 && c <= 0xd7ff) || (c >= 0xe000 && c <= 0xfffd);
129+
if (allowedInXml) {
130+
if (unicode || c < 127) {
131+
writer.write(c);
132+
} else {
133+
writer.write("&#" + ((int) c) + ";");
134+
}
135+
} else if (Character.isHighSurrogate(c) && i < s.length() - 1) {
136+
writeSurrogate(c, s.charAt(i + 1));
137+
++i;
134138
} else {
135-
writer.write("&#" + ((int) c) + ";");
139+
reportInvalidCharacter(c);
136140
}
137141
// END android-changed
138142
}
@@ -141,7 +145,7 @@ private final void writeEscaped(String s, int quot) throws IOException {
141145

142146
// BEGIN android-added
143147
private static void reportInvalidCharacter(char ch) {
144-
throw new IllegalArgumentException("Illegal character (" + Integer.toHexString((int) ch) + ")");
148+
throw new IllegalArgumentException("Illegal character (U+" + Integer.toHexString((int) ch) + ")");
145149
}
146150
// END android-added
147151

@@ -548,22 +552,41 @@ public void cdsect(String data) throws IOException {
548552
// BEGIN android-changed: ]]> is not allowed within a CDATA,
549553
// so break and start a new one when necessary.
550554
data = data.replace("]]>", "]]]]><![CDATA[>");
551-
char[] chars = data.toCharArray();
552-
// We also aren't allowed any invalid characters.
553-
for (char ch : chars) {
554-
boolean valid = (ch >= 0x20 && ch <= 0xd7ff) ||
555+
writer.write("<![CDATA[");
556+
for (int i = 0; i < data.length(); ++i) {
557+
char ch = data.charAt(i);
558+
boolean allowedInCdata = (ch >= 0x20 && ch <= 0xd7ff) ||
555559
(ch == '\t' || ch == '\n' || ch == '\r') ||
556560
(ch >= 0xe000 && ch <= 0xfffd);
557-
if (!valid) {
561+
if (allowedInCdata) {
562+
writer.write(ch);
563+
} else if (Character.isHighSurrogate(ch) && i < data.length() - 1) {
564+
// Character entities aren't valid in CDATA, so break out for this.
565+
writer.write("]]>");
566+
writeSurrogate(ch, data.charAt(++i));
567+
writer.write("<![CDATA[");
568+
} else {
558569
reportInvalidCharacter(ch);
559570
}
560571
}
561-
writer.write("<![CDATA[");
562-
writer.write(chars, 0, chars.length);
563572
writer.write("]]>");
564573
// END android-changed
565574
}
566575

576+
// BEGIN android-added
577+
private void writeSurrogate(char high, char low) throws IOException {
578+
if (!Character.isLowSurrogate(low)) {
579+
throw new IllegalArgumentException("Bad surrogate pair (U+" + Integer.toHexString((int) high) +
580+
" U+" + Integer.toHexString((int) low) + ")");
581+
}
582+
// Java-style surrogate pairs aren't allowed in XML. We could use the > 3-byte encodings, but that
583+
// seems likely to upset anything expecting modified UTF-8 rather than "real" UTF-8. It seems more
584+
// conservative in a Java environment to use an entity reference instead.
585+
int codePoint = Character.toCodePoint(high, low);
586+
writer.write("&#" + codePoint + ";");
587+
}
588+
// END android-added
589+
567590
public void comment(String comment) throws IOException {
568591
check(false);
569592
writer.write("<!--");

0 commit comments

Comments
 (0)