diff --git a/src/org/sosy_lab/java_smt/api/StringFormulaManager.java b/src/org/sosy_lab/java_smt/api/StringFormulaManager.java index 8766c874af..3d3c04f061 100644 --- a/src/org/sosy_lab/java_smt/api/StringFormulaManager.java +++ b/src/org/sosy_lab/java_smt/api/StringFormulaManager.java @@ -12,6 +12,7 @@ import java.util.Arrays; import java.util.List; import org.sosy_lab.java_smt.api.NumeralFormula.IntegerFormula; +import org.sosy_lab.java_smt.basicimpl.AbstractStringFormulaManager; /** * Manager for dealing with string formulas. Functions come from This method accepts plain Java Strings with Unicode characters from the Basic Multilingual - * Plane (BMP) (codepoints in range [0x00000, 0x2FFFF]). JavaSMT handles escaping internally, as - * some solvers follow the SMTLIB standard and escape Unicode characters with curly braces. - * - *

Additionally, you can use SMTLIB escaping like "\\u{1234}" to represent Unicode characters - * directly. + *

The String argument is expected to be a regular Java String and may contain Unicode + * characters from the first 3 planes (codepoints 0x00000-0x2FFFFF). Higher codepoints are not + * allowed due to limitations in SMTLIB. We do not support SMTLIB escape sequences in this method, + * and String like "\\u{abc}" are read verbatim and are not substituted with their + * Unicode character. If you still want to use SMTLIB Strings with this method, the function + * {@link AbstractStringFormulaManager#unescapeUnicodeForSmtlib(String)} can be used to handle the + * conversion before calling this method. Note that you may then also have to use {@link + * AbstractStringFormulaManager#escapeUnicodeForSmtlib(String)} to convert Strings from the model + * back to a format that is compatible with other SMTLIB based solvers. * * @param value the string value the returned {@link StringFormula} should represent * @return a {@link StringFormula} representing the given value @@ -246,8 +250,8 @@ default RegexFormula concat(RegexFormula... parts) { /** * Returns a String formula representing the single character with the given code point, if it is - * a valid Unicode code point within the Basic Multilingual Plane (BMP) (codepoints in range - * [0x00000, 0x2FFFF]). Otherwise, returns the empty string. + * a valid Unicode code point within the Basic Multilingual Plane (BMP) or planes 1 and 2 + * (codepoints in range [0x00000, 0x2FFFF]). Otherwise, returns the empty string. */ StringFormula fromCodePoint(IntegerFormula codePoint); } diff --git a/src/org/sosy_lab/java_smt/basicimpl/AbstractStringFormulaManager.java b/src/org/sosy_lab/java_smt/basicimpl/AbstractStringFormulaManager.java index 9b68dfa57b..71af01b1a4 100644 --- a/src/org/sosy_lab/java_smt/basicimpl/AbstractStringFormulaManager.java +++ b/src/org/sosy_lab/java_smt/basicimpl/AbstractStringFormulaManager.java @@ -38,7 +38,7 @@ public abstract class AbstractStringFormulaManager[0-9a-fA-F]{4})" + "|" // or curly brackets like "\\u{61}" - + "(\\{(?[0-9a-fA-F]{1,5})}))"); + + "(\\{(?[0-9a-fA-F]{1,5})\\}))"); protected AbstractStringFormulaManager( FormulaCreator pCreator) { @@ -60,28 +60,29 @@ protected IntegerFormula wrapInteger(TFormulaInfo formulaInfo) { @Override public StringFormula makeString(String value) { checkArgument( - areAllCodePointsInRange(value), - "String constant is out of supported Unicode range (Plane 0-2)."); + areAllCodePointsInRange(value, 0, 0x2FFFF), + "String constants may only contain Unicode characters from the first three planes " + + "(codepoints 0x00000 to 0x2FFFF)."); return wrapString(makeStringImpl(value)); } - /** returns whether all Unicode characters in Planes 0-2. */ - private static boolean areAllCodePointsInRange(String str) { - return str.codePoints().allMatch(AbstractStringFormulaManager::isCodePointInRange); - } - - private static boolean isCodePointInRange(int codePoint) { - return 0x00000 <= codePoint && codePoint <= 0x2FFFF; + /** Check if the codepoints of all characters in the String are in range. */ + private static boolean areAllCodePointsInRange(String str, int lower, int upper) { + return str.codePoints().allMatch(codePoint -> lower <= codePoint && codePoint <= upper); } /** Replace Unicode letters in UTF16 representation with their escape sequences. */ - protected static String escapeUnicodeForSmtlib(String input) { + public static String escapeUnicodeForSmtlib(String input) { StringBuilder sb = new StringBuilder(); for (int codePoint : input.codePoints().toArray()) { - if (0x20 <= codePoint && codePoint <= 0x7E) { + if (codePoint == 0x5c) { + // Backslashes must be escaped, otherwise they may get substituted when reading back + // the results from the model + sb.append("\\u{5c}"); + } else if (0x20 <= codePoint && codePoint <= 0x7E) { sb.appendCodePoint(codePoint); // normal printable chars } else { - sb.append("\\u{").append(String.format("%05X", codePoint)).append("}"); + sb.append("\\u{").append(String.format("%x", codePoint)).append("}"); } } return sb.toString(); @@ -98,9 +99,15 @@ public static String unescapeUnicodeForSmtlib(String input) { } int codePoint = Integer.parseInt(hexCodePoint, 16); checkArgument( - isCodePointInRange(codePoint), + 0 <= codePoint && codePoint <= 0x2FFFF, "SMTLIB does only specify Unicode letters from Planes 0-2"); - matcher.appendReplacement(sb, Character.toString(codePoint)); + String replacement = Character.toString(codePoint); + if (replacement.equals("\\")) { + // Matcher.appendReplacement considers '\' as special character. + // Substitute with '\\' instead + replacement = "\\\\"; + } + matcher.appendReplacement(sb, replacement); } matcher.appendTail(sb); return sb.toString(); diff --git a/src/org/sosy_lab/java_smt/solvers/cvc4/CVC4StringFormulaManager.java b/src/org/sosy_lab/java_smt/solvers/cvc4/CVC4StringFormulaManager.java index 0e167b418a..fdb360f8b6 100644 --- a/src/org/sosy_lab/java_smt/solvers/cvc4/CVC4StringFormulaManager.java +++ b/src/org/sosy_lab/java_smt/solvers/cvc4/CVC4StringFormulaManager.java @@ -29,7 +29,6 @@ class CVC4StringFormulaManager extends AbstractStringFormulaManagerThe input String may still contain Unicode characters. + */ + private StringFormula makeStringEscaped(String pString) { + return smgr.makeString(AbstractStringFormulaManager.unescapeUnicodeForSmtlib(pString)); + } + private void requireVariableStringLiterals() { // FIXME: Remove once support for operations on non-singleton Strings has been added // See https://github.com/uuverifiers/ostrich/issues/88 @@ -121,6 +134,42 @@ private void requireVariableStringLiterals() { // Tests + @Test + public void testInputEscape() throws SolverException, InterruptedException { + // Check that escape sequences are not substituted + assertEqual(smgr.length(smgr.makeString("\\u{39E}")), imgr.makeNumber(7)); + // ...unless we're doing so explicitly: + assertEqual(smgr.length(makeStringEscaped("\\u{39E}")), imgr.makeNumber(1)); + + // Test that Unicode characters are working + assertEqual(smgr.length(smgr.makeString("Ξ")), imgr.makeNumber(1)); + if (solver != Solvers.PRINCESS) { + // Test with a character that is not in the BMP + String str = Character.toString(0x200cb); + assertEqual(smgr.length(smgr.makeString(str)), imgr.makeNumber(1)); + } + } + + @Test + public void testOutputUnescape() throws SolverException, InterruptedException { + // Test if Unicode escape sequences get properly converted back when reading from the model. + try (ProverEnvironment prover = context.newProverEnvironment(ProverOptions.GENERATE_MODELS)) { + assertThat(!prover.isUnsat()).isTrue(); + try (Model model = prover.getModel()) { + assertThat(model.evaluate(makeStringEscaped("\\u{39E}"))).isEqualTo("Ξ"); + assertThat( + model.evaluate(smgr.concat(makeStringEscaped("\\u{39E"), makeStringEscaped("}")))) + .isEqualTo("\\u{39E}"); + + // Test with a character that is not in the BMP + if (solver != Solvers.PRINCESS && solver != Solvers.CVC5) { + String str = Character.toString(0x200cb); + assertThat(model.evaluate(smgr.makeString(str))).isEqualTo(str); + } + } + } + } + @Test public void testRegexAll() throws SolverException, InterruptedException { RegexFormula regex = smgr.all(); @@ -162,18 +211,18 @@ public void testRegexAllCharUnicode() throws SolverException, InterruptedExcepti // special single characters. // Greek Capital Letter Delta - assertThatFormula(smgr.in(smgr.makeString("\\u0394"), regexAllChar)).isTautological(); - assertThatFormula(smgr.in(smgr.makeString("Δ"), regexAllChar)).isTautological(); + assertThatFormula(smgr.in(makeStringEscaped("\\u0394"), regexAllChar)).isTautological(); + assertThatFormula(smgr.in(makeStringEscaped("Δ"), regexAllChar)).isTautological(); // CJK Compatibility Ideograph from Basic Multilingual Plane. - assertThatFormula(smgr.in(smgr.makeString("\\u{fa6a}"), regexAllChar)).isTautological(); + assertThatFormula(smgr.in(makeStringEscaped("\\u{fa6a}"), regexAllChar)).isTautological(); // Xiangqi Black Horse from Supplementary Multilingual Plane - assertThatFormula(smgr.in(smgr.makeString("\\u{1fa6a}"), regexAllChar)).isTautological(); + assertThatFormula(smgr.in(makeStringEscaped("\\u{1fa6a}"), regexAllChar)).isTautological(); // Combining characters are not matched as one character. - assertThatFormula(smgr.in(smgr.makeString("ab"), regexAllChar)).isUnsatisfiable(); - assertThatFormula(smgr.in(smgr.makeString("abcdefgh"), regexAllChar)).isUnsatisfiable(); - assertThatFormula(smgr.in(smgr.makeString("a\\u0336"), regexAllChar)).isUnsatisfiable(); - assertThatFormula(smgr.in(smgr.makeString("\\n"), regexAllChar)).isUnsatisfiable(); + assertThatFormula(smgr.in(makeStringEscaped("ab"), regexAllChar)).isUnsatisfiable(); + assertThatFormula(smgr.in(makeStringEscaped("abcdefgh"), regexAllChar)).isUnsatisfiable(); + assertThatFormula(smgr.in(makeStringEscaped("a\\u0336"), regexAllChar)).isUnsatisfiable(); + assertThatFormula(smgr.in(makeStringEscaped("\\n"), regexAllChar)).isUnsatisfiable(); StringFormula x = smgr.makeVariable("x"); assertThatFormula(smgr.in(x, smgr.range('a', 'z'))).isSatisfiable(); @@ -365,7 +414,7 @@ public void testStringLength() throws SolverException, InterruptedException { assertEqual(imgr.makeNumber(1), smgr.length(smgr.makeString("\n"))); assertEqual(imgr.makeNumber(1), smgr.length(smgr.makeString("\t"))); - assertEqual(imgr.makeNumber(1), smgr.length(smgr.makeString("\\u{20AC}"))); + assertEqual(imgr.makeNumber(1), smgr.length(makeStringEscaped("\\u{20AC}"))); assertEqual(imgr.makeNumber(1), smgr.length(smgr.makeString("€"))); assertEqual(imgr.makeNumber(1), smgr.length(smgr.makeString("Δ"))); @@ -870,8 +919,8 @@ public void testCharAtWithSpecialCharacters() throws SolverException, Interrupte String workaround = "au{1234}"; StringFormula au1234WOEscapeCurly = smgr.makeString(workaround); StringFormula backSlash = smgr.makeString("\\"); - StringFormula u1234 = smgr.makeString("\\u{1234}"); - StringFormula au1234b = smgr.makeString("a\\u{1234}b"); + StringFormula u1234 = makeStringEscaped("\\u{1234}"); + StringFormula au1234b = makeStringEscaped("a\\u{1234}b"); assertEqual(smgr.length(backSlash), imgr.makeNumber(1)); assertEqual(smgr.charAt(au1234b, imgr.makeNumber(0)), a); @@ -901,22 +950,22 @@ public void testCharAtWithSpecialCharacters() throws SolverException, Interrupte @Test public void testUnicodeEscaping() throws SolverException, InterruptedException { // SMTLIB has different representations for the same symbol - assertEqual(a, smgr.makeString("\u0061")); - assertEqual(a, smgr.makeString("\\u0061")); - assertEqual(a, smgr.makeString("\\u{61}")); - assertEqual(a, smgr.makeString("\\u{00061}")); + assertEqual(a, makeStringEscaped("\u0061")); + assertEqual(a, makeStringEscaped("\\u0061")); + assertEqual(a, makeStringEscaped("\\u{61}")); + assertEqual(a, makeStringEscaped("\\u{00061}")); assertEqual(smgr.length(a), imgr.makeNumber(1)); - StringFormula u0 = smgr.makeString("\\u0000"); - assertEqual(u0, smgr.makeString("\u0000")); - assertEqual(u0, smgr.makeString("\\u{0}")); - assertEqual(u0, smgr.makeString("\\u{00000}")); + StringFormula u0 = makeStringEscaped("\\u0000"); + assertEqual(u0, makeStringEscaped("\u0000")); + assertEqual(u0, makeStringEscaped("\\u{0}")); + assertEqual(u0, makeStringEscaped("\\u{00000}")); assertEqual(smgr.length(u0), imgr.makeNumber(1)); - StringFormula u1 = smgr.makeString("\\u1234"); - assertEqual(u1, smgr.makeString("\u1234")); - assertEqual(u1, smgr.makeString("\\u{1234}")); - assertEqual(u1, smgr.makeString("\\u{01234}")); + StringFormula u1 = makeStringEscaped("\\u1234"); + assertEqual(u1, makeStringEscaped("\u1234")); + assertEqual(u1, makeStringEscaped("\\u{1234}")); + assertEqual(u1, makeStringEscaped("\\u{01234}")); assertEqual(smgr.length(u1), imgr.makeNumber(1)); if (solverToUse() == Solvers.PRINCESS) { @@ -926,10 +975,10 @@ public void testUnicodeEscaping() throws SolverException, InterruptedException { Character.toString(0x10000), "\\u{2FFFF}", Character.toString(0x2FFFF))) { - assertThrows(IllegalArgumentException.class, () -> smgr.makeString(invalidStr)); + assertThrows(IllegalArgumentException.class, () -> makeStringEscaped(invalidStr)); } } else { - StringFormula u2 = smgr.makeString("\\u{2FFFF}"); + StringFormula u2 = makeStringEscaped("\\u{2FFFF}"); assertEqual(u2, smgr.makeString(Character.toString(0x2FFFF))); assertEqual(smgr.length(u2), imgr.makeNumber(1)); } @@ -943,10 +992,10 @@ public void testUnicodeEscaping() throws SolverException, InterruptedException { public void testCharAtWithSpecialCharacters2Byte() throws SolverException, InterruptedException { StringFormula num7 = smgr.makeString("7"); StringFormula u = smgr.makeString("u"); - StringFormula curlyOpen2BUnicode = smgr.makeString("\\u{7B}"); - StringFormula curlyClose2BUnicode = smgr.makeString("\\u{7D}"); - StringFormula acurlyClose2BUnicodeb = smgr.makeString("a\\u{7D}b"); - StringFormula acurlyOpen2BUnicodeWOEscapeCurly = smgr.makeString("au{7B}"); + StringFormula curlyOpen2BUnicode = makeStringEscaped("\\u{7B}"); + StringFormula curlyClose2BUnicode = makeStringEscaped("\\u{7D}"); + StringFormula acurlyClose2BUnicodeb = makeStringEscaped("a\\u{7D}b"); + StringFormula acurlyOpen2BUnicodeWOEscapeCurly = makeStringEscaped("au{7B}"); StringFormula backSlash = smgr.makeString("\\"); assertEqual(smgr.length(backSlash), imgr.makeNumber(1)); @@ -1035,10 +1084,10 @@ public void testConstStringContains() throws SolverException, InterruptedExcepti StringFormula abbbbbb = smgr.makeString("abbbbbb"); StringFormula aaaaaaaB = smgr.makeString("aaaaaaaB"); StringFormula abcAndSoOn = smgr.makeString("abcdefghijklmnopqrstuVwxyz"); - StringFormula curlyOpen2BUnicode = smgr.makeString("\\u{7B}"); - StringFormula curlyClose2BUnicode = smgr.makeString("\\u{7D}"); - StringFormula multipleCurlys2BUnicode = smgr.makeString("\\u{7B}\\u{7D}\\u{7B}\\u{7B}"); - StringFormula curlyClose2BUnicodeEncased = smgr.makeString("blabla\\u{7D}bla"); + StringFormula curlyOpen2BUnicode = makeStringEscaped("\\u{7B}"); + StringFormula curlyClose2BUnicode = makeStringEscaped("\\u{7D}"); + StringFormula multipleCurlys2BUnicode = makeStringEscaped("\\u{7B}\\u{7D}\\u{7B}\\u{7B}"); + StringFormula curlyClose2BUnicodeEncased = makeStringEscaped("blabla\\u{7D}bla"); assertThatFormula(smgr.contains(empty, empty)).isSatisfiable(); assertThatFormula(smgr.contains(empty, a)).isUnsatisfiable(); @@ -1078,8 +1127,8 @@ public void testStringVariableContains() throws SolverException, InterruptedExce StringFormula bUppercase = smgr.makeString("B"); StringFormula bbbbbb = smgr.makeString("bbbbbb"); StringFormula abbbbbb = smgr.makeString("abbbbbb"); - StringFormula curlyOpen2BUnicode = smgr.makeString("\\u{7B}"); - StringFormula curlyClose2BUnicode = smgr.makeString("\\u{7D}"); + StringFormula curlyOpen2BUnicode = makeStringEscaped("\\u{7B}"); + StringFormula curlyClose2BUnicode = makeStringEscaped("\\u{7D}"); assertThatFormula( bmgr.and(smgr.contains(var1, empty), imgr.equal(imgr.makeNumber(0), smgr.length(var1)))) @@ -1140,10 +1189,10 @@ public void testConstStringIndexOf() throws SolverException, InterruptedExceptio StringFormula bbbbbbb = smgr.makeString("bbbbbbb"); StringFormula abbbbbb = smgr.makeString("abbbbbb"); StringFormula abcAndSoOn = smgr.makeString("abcdefghijklmnopqrstuVwxyz"); - StringFormula curlyOpen2BUnicode = smgr.makeString("\\u{7B}"); - StringFormula curlyClose2BUnicode = smgr.makeString("\\u{7D}"); - StringFormula multipleCurlys2BUnicode = smgr.makeString("\\u{7B}\\u{7D}\\u{7B}\\u{7B}"); - StringFormula curlys2BUnicodeWOEscape = smgr.makeString("\\u7B\\u7D"); + StringFormula curlyOpen2BUnicode = makeStringEscaped("\\u{7B}"); + StringFormula curlyClose2BUnicode = makeStringEscaped("\\u{7D}"); + StringFormula multipleCurlys2BUnicode = makeStringEscaped("\\u{7B}\\u{7D}\\u{7B}\\u{7B}"); + StringFormula curlys2BUnicodeWOEscape = makeStringEscaped("\\u7B\\u7D"); IntegerFormula zero = imgr.makeNumber(0); IntegerFormula one = imgr.makeNumber(1); @@ -1178,7 +1227,7 @@ public void testStringVariableIndexOf() throws SolverException, InterruptedExcep StringFormula var2 = smgr.makeVariable("var2"); IntegerFormula intVar = imgr.makeVariable("intVar"); - StringFormula curlyOpen2BUnicode = smgr.makeString("\\u{7B}"); + StringFormula curlyOpen2BUnicode = makeStringEscaped("\\u{7B}"); IntegerFormula zero = imgr.makeNumber(0); @@ -1281,9 +1330,9 @@ public void testConstStringSubStrings() throws SolverException, InterruptedExcep StringFormula aUppercase = smgr.makeString("A"); StringFormula bUppercase = smgr.makeString("B"); StringFormula bbbbbb = smgr.makeString("bbbbbb"); - StringFormula curlyOpen2BUnicode = smgr.makeString("\\u{7B}"); - StringFormula curlyClose2BUnicode = smgr.makeString("\\u{7D}"); - StringFormula multipleCurlys2BUnicode = smgr.makeString("\\u{7B}\\u{7D}\\u{7B}\\u{7B}"); + StringFormula curlyOpen2BUnicode = makeStringEscaped("\\u{7B}"); + StringFormula curlyClose2BUnicode = makeStringEscaped("\\u{7D}"); + StringFormula multipleCurlys2BUnicode = makeStringEscaped("\\u{7B}\\u{7D}\\u{7B}\\u{7B}"); IntegerFormula zero = imgr.makeNumber(0); IntegerFormula one = imgr.makeNumber(1); @@ -1328,8 +1377,8 @@ public void testStringSubstringOutOfBounds() throws SolverException, Interrupted StringFormula bbbbbb = smgr.makeString("bbbbbb"); StringFormula abbbbbb = smgr.makeString("abbbbbb"); - StringFormula multipleCurlys2BUnicode = smgr.makeString("\\u{7B}\\u{7D}\\u{7B}\\u{7B}"); - StringFormula multipleCurlys2BUnicodeFromIndex1 = smgr.makeString("\\u{7D}\\u{7B}\\u{7B}"); + StringFormula multipleCurlys2BUnicode = makeStringEscaped("\\u{7B}\\u{7D}\\u{7B}\\u{7B}"); + StringFormula multipleCurlys2BUnicodeFromIndex1 = makeStringEscaped("\\u{7D}\\u{7B}\\u{7B}"); assertEqual(smgr.substring(abbbbbb, imgr.makeNumber(0), imgr.makeNumber(10000)), abbbbbb); assertEqual(smgr.substring(abbbbbb, imgr.makeNumber(6), imgr.makeNumber(10000)), b); @@ -1622,7 +1671,7 @@ public void testConstStringReplaceAll() throws SolverException, InterruptedExcep .isNotEqualTo(Solvers.Z3); for (int i = 0; i < WORDS.size(); i++) { - for (int j = 1; j < WORDS.size(); j++) { + for (int j = 0; j < WORDS.size(); j++) { String word1 = WORDS.get(i); String word2 = WORDS.get(j); String word3 = "replacement"; @@ -1630,7 +1679,8 @@ public void testConstStringReplaceAll() throws SolverException, InterruptedExcep StringFormula word2F = smgr.makeString(word2); StringFormula word3F = smgr.makeString(word3); - StringFormula result = smgr.makeString(word3.replaceAll(word2, word1)); + StringFormula result = + smgr.makeString(word2.isEmpty() ? word3 : word3.replaceAll(word2, word1)); assertEqual(smgr.replaceAll(word3F, word2F, word1F), result); } } @@ -1724,11 +1774,11 @@ public void testStringVariableReplaceAllSubstring() throws SolverException, Inte public void testStringConcatWUnicode() throws SolverException, InterruptedException { StringFormula backslash = smgr.makeString("\\"); StringFormula u = smgr.makeString("u"); - StringFormula curlyOpen = smgr.makeString("\\u{7B}"); + StringFormula curlyOpen = makeStringEscaped("\\u{7B}"); StringFormula sevenB = smgr.makeString("7B"); - StringFormula curlyClose = smgr.makeString("\\u{7D}"); + StringFormula curlyClose = makeStringEscaped("\\u{7D}"); StringFormula concat = smgr.concat(backslash, u, curlyOpen, sevenB, curlyClose); - StringFormula complete = smgr.makeString("\\u{7B}"); + StringFormula complete = makeStringEscaped("\\u{7B}"); // Concatting parts of unicode does not result in the unicode char! assertDistinct(concat, complete);