From 825d5f7a4a6c267e250e07e88ff010b34a92da64 Mon Sep 17 00:00:00 2001 From: Daniel Raffler Date: Mon, 25 Nov 2024 13:55:07 +0100 Subject: [PATCH 01/13] Strings: Added more tests for Unicode escaping --- .../java_smt/test/ModelEvaluationTest.java | 6 ++--- .../test/StringFormulaManagerTest.java | 23 +++++++++++++++++++ 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/src/org/sosy_lab/java_smt/test/ModelEvaluationTest.java b/src/org/sosy_lab/java_smt/test/ModelEvaluationTest.java index f07ff9a6dc..16d3cff2ff 100644 --- a/src/org/sosy_lab/java_smt/test/ModelEvaluationTest.java +++ b/src/org/sosy_lab/java_smt/test/ModelEvaluationTest.java @@ -173,14 +173,12 @@ public void testGetStringsEvaluation() throws SolverException, InterruptedExcept Lists.newArrayList("hello \u00e6@\u20ac \u1234 \u4321"), Lists.newArrayList(smgr.makeString("hello \u00e6@\u20ac \u1234 \u4321"))); - // TODO Z3 and CVC4 seem to break escaping on invalid Unicode Strings. - /* - evaluateInModel( + // invalid Unicode escape sequences (should be treated as normal characters) + evaluateInModel( smgr.equal(smgr.makeVariable("x"), smgr.makeString("\\u")), smgr.makeVariable("x"), Lists.newArrayList("\\u"), Lists.newArrayList(smgr.makeString("\\u"))); - */ // foreign variable: x vs y evaluateInModel( diff --git a/src/org/sosy_lab/java_smt/test/StringFormulaManagerTest.java b/src/org/sosy_lab/java_smt/test/StringFormulaManagerTest.java index ca7cfc7985..8d6512f7a5 100644 --- a/src/org/sosy_lab/java_smt/test/StringFormulaManagerTest.java +++ b/src/org/sosy_lab/java_smt/test/StringFormulaManagerTest.java @@ -24,8 +24,11 @@ import org.sosy_lab.java_smt.SolverContextFactory.Solvers; import org.sosy_lab.java_smt.api.BooleanFormula; import org.sosy_lab.java_smt.api.Formula; +import org.sosy_lab.java_smt.api.Model; import org.sosy_lab.java_smt.api.NumeralFormula.IntegerFormula; +import org.sosy_lab.java_smt.api.ProverEnvironment; import org.sosy_lab.java_smt.api.RegexFormula; +import org.sosy_lab.java_smt.api.SolverContext.ProverOptions; import org.sosy_lab.java_smt.api.SolverException; import org.sosy_lab.java_smt.api.StringFormula; @@ -121,6 +124,26 @@ private void requireVariableStringLiterals() { // Tests + @Test + public void testInputEscape() throws SolverException, InterruptedException { + // Test if SMTLIB Unicode literals are recognized and converted to their Unicode characters. + assertEqual(smgr.length(smgr.makeString("Ξ")), imgr.makeNumber(1)); + assertEqual(smgr.length(smgr.makeString("\\u{39E}")), imgr.makeNumber(1)); + } + + @Test + public void testOutputUnescape() throws SolverException, InterruptedException { + // Test if Unicode escape sequences get properly converted back when reading from the model. + try (ProverEnvironment prover = context.newProverEnvironment(ProverOptions.GENERATE_MODELS)) { + assertThat(!prover.isUnsat()).isTrue(); + try (Model model = prover.getModel()) { + assertThat(model.evaluate(smgr.makeString("\\u{39E}"))).isEqualTo("Ξ"); + assertThat(model.evaluate(smgr.concat(smgr.makeString("\\u{39E"), smgr.makeString("}")))) + .isEqualTo("\\u{39E}"); + } + } + } + @Test public void testRegexAll() throws SolverException, InterruptedException { RegexFormula regex = smgr.all(); From 0d2be265a02af49df47da5487385783f281b9e21 Mon Sep 17 00:00:00 2001 From: Daniel Raffler Date: Thu, 16 Jan 2025 16:56:02 +0100 Subject: [PATCH 02/13] Strings: Patch a bug in testConstStringReplaceAll The test uses Strings.replaceAll and compares it to the result of str.replace_all in SMTLIB. However, the two functions behave different when the "matching" String is empty, and we need a special case for that. --- src/org/sosy_lab/java_smt/test/StringFormulaManagerTest.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/org/sosy_lab/java_smt/test/StringFormulaManagerTest.java b/src/org/sosy_lab/java_smt/test/StringFormulaManagerTest.java index 8d6512f7a5..75c471d340 100644 --- a/src/org/sosy_lab/java_smt/test/StringFormulaManagerTest.java +++ b/src/org/sosy_lab/java_smt/test/StringFormulaManagerTest.java @@ -1645,7 +1645,7 @@ public void testConstStringReplaceAll() throws SolverException, InterruptedExcep .isNotEqualTo(Solvers.Z3); for (int i = 0; i < WORDS.size(); i++) { - for (int j = 1; j < WORDS.size(); j++) { + for (int j = 0; j < WORDS.size(); j++) { String word1 = WORDS.get(i); String word2 = WORDS.get(j); String word3 = "replacement"; @@ -1653,7 +1653,8 @@ public void testConstStringReplaceAll() throws SolverException, InterruptedExcep StringFormula word2F = smgr.makeString(word2); StringFormula word3F = smgr.makeString(word3); - StringFormula result = smgr.makeString(word3.replaceAll(word2, word1)); + StringFormula result = + smgr.makeString(word2.isEmpty() ? word3 : word3.replaceAll(word2, word1)); assertEqual(smgr.replaceAll(word3F, word2F, word1F), result); } } From 0f314158df503ced854586fff0901caf37f18207 Mon Sep 17 00:00:00 2001 From: Daniel Raffler Date: Wed, 1 Jan 2025 18:31:48 +0100 Subject: [PATCH 03/13] Strings: Fix pattern for \u{...} escape sequences. The final '}' in the pattern should also be escaped. --- .../java_smt/basicimpl/AbstractStringFormulaManager.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/org/sosy_lab/java_smt/basicimpl/AbstractStringFormulaManager.java b/src/org/sosy_lab/java_smt/basicimpl/AbstractStringFormulaManager.java index 9b68dfa57b..cdd32fc3b0 100644 --- a/src/org/sosy_lab/java_smt/basicimpl/AbstractStringFormulaManager.java +++ b/src/org/sosy_lab/java_smt/basicimpl/AbstractStringFormulaManager.java @@ -38,7 +38,7 @@ public abstract class AbstractStringFormulaManager[0-9a-fA-F]{4})" + "|" // or curly brackets like "\\u{61}" - + "(\\{(?[0-9a-fA-F]{1,5})}))"); + + "(\\{(?[0-9a-fA-F]{1,5})\\}))"); protected AbstractStringFormulaManager( FormulaCreator pCreator) { From 0b8d1a332076c9852e1f2dff0a1f5595fcc52c15 Mon Sep 17 00:00:00 2001 From: Daniel Raffler Date: Wed, 1 Jan 2025 18:33:30 +0100 Subject: [PATCH 04/13] Strings: Escape backslashes when creating String literals. This is needed to protect the backslash from substitution later when getting the results from the model. --- .../java_smt/basicimpl/AbstractStringFormulaManager.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/org/sosy_lab/java_smt/basicimpl/AbstractStringFormulaManager.java b/src/org/sosy_lab/java_smt/basicimpl/AbstractStringFormulaManager.java index cdd32fc3b0..87a2da0bb1 100644 --- a/src/org/sosy_lab/java_smt/basicimpl/AbstractStringFormulaManager.java +++ b/src/org/sosy_lab/java_smt/basicimpl/AbstractStringFormulaManager.java @@ -78,7 +78,11 @@ private static boolean isCodePointInRange(int codePoint) { protected static String escapeUnicodeForSmtlib(String input) { StringBuilder sb = new StringBuilder(); for (int codePoint : input.codePoints().toArray()) { - if (0x20 <= codePoint && codePoint <= 0x7E) { + if (codePoint == 0x5c) { + // Backslashes must be escaped, otherwise they may get substituted when reading back + // the results from the model + sb.append("\\u{5c}"); + } else if (0x20 <= codePoint && codePoint <= 0x7E) { sb.appendCodePoint(codePoint); // normal printable chars } else { sb.append("\\u{").append(String.format("%05X", codePoint)).append("}"); From 6738564aa9c493cee1104636f99ea0bdc1afe61f Mon Sep 17 00:00:00 2001 From: Daniel Raffler Date: Wed, 1 Jan 2025 18:36:46 +0100 Subject: [PATCH 05/13] Strings: Add a separate case for the escape sequence "\u{5c}" in unescapeUnicodeForSmtlib() as backslashes (= codepoint 5c) are considered special characters by Matcher.appendReplacement() --- .../java_smt/basicimpl/AbstractStringFormulaManager.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/org/sosy_lab/java_smt/basicimpl/AbstractStringFormulaManager.java b/src/org/sosy_lab/java_smt/basicimpl/AbstractStringFormulaManager.java index 87a2da0bb1..6660d58231 100644 --- a/src/org/sosy_lab/java_smt/basicimpl/AbstractStringFormulaManager.java +++ b/src/org/sosy_lab/java_smt/basicimpl/AbstractStringFormulaManager.java @@ -104,7 +104,13 @@ public static String unescapeUnicodeForSmtlib(String input) { checkArgument( isCodePointInRange(codePoint), "SMTLIB does only specify Unicode letters from Planes 0-2"); - matcher.appendReplacement(sb, Character.toString(codePoint)); + String replacement = Character.toString(codePoint); + if (replacement.equals("\\")) { + // Matcher.appendReplacement considers '\' as special character. + // Substitute with '\\' instead + replacement = "\\\\"; + } + matcher.appendReplacement(sb, replacement); } matcher.appendTail(sb); return sb.toString(); From 1f3d5c59c3f9fc0efbc2037e97534865fe7cea6b Mon Sep 17 00:00:00 2001 From: Daniel Raffler Date: Wed, 1 Jan 2025 18:42:56 +0100 Subject: [PATCH 06/13] Strings: Always unescape the String first when creating a new String constant. This is needed even for solvers like CVC4+5 or Z3 that expect Unicode characters to be escaped. We first need to unescape the String to resolve any escape sequences from the user, and then apply escape again before sending the String to the solver. --- .../java_smt/solvers/cvc4/CVC4StringFormulaManager.java | 4 ++-- .../java_smt/solvers/cvc5/CVC5StringFormulaManager.java | 3 ++- .../sosy_lab/java_smt/solvers/z3/Z3StringFormulaManager.java | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/org/sosy_lab/java_smt/solvers/cvc4/CVC4StringFormulaManager.java b/src/org/sosy_lab/java_smt/solvers/cvc4/CVC4StringFormulaManager.java index 0e167b418a..e72ec9bf30 100644 --- a/src/org/sosy_lab/java_smt/solvers/cvc4/CVC4StringFormulaManager.java +++ b/src/org/sosy_lab/java_smt/solvers/cvc4/CVC4StringFormulaManager.java @@ -29,8 +29,8 @@ class CVC4StringFormulaManager extends AbstractStringFormulaManager Date: Tue, 25 Feb 2025 13:37:28 +0100 Subject: [PATCH 07/13] Strings: Add tests for Unicode characters that are not in the BMP --- .../java_smt/test/StringFormulaManagerTest.java | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/org/sosy_lab/java_smt/test/StringFormulaManagerTest.java b/src/org/sosy_lab/java_smt/test/StringFormulaManagerTest.java index 75c471d340..9e5439b8a4 100644 --- a/src/org/sosy_lab/java_smt/test/StringFormulaManagerTest.java +++ b/src/org/sosy_lab/java_smt/test/StringFormulaManagerTest.java @@ -127,8 +127,14 @@ private void requireVariableStringLiterals() { @Test public void testInputEscape() throws SolverException, InterruptedException { // Test if SMTLIB Unicode literals are recognized and converted to their Unicode characters. - assertEqual(smgr.length(smgr.makeString("Ξ")), imgr.makeNumber(1)); assertEqual(smgr.length(smgr.makeString("\\u{39E}")), imgr.makeNumber(1)); + assertEqual(smgr.length(smgr.makeString("Ξ")), imgr.makeNumber(1)); + + // Test with a character that is not in the BMP + if (solver != Solvers.PRINCESS) { + String str = Character.toString(0x200cb); + assertEqual(smgr.length(smgr.makeString(str)), imgr.makeNumber(1)); + } } @Test @@ -140,6 +146,12 @@ public void testOutputUnescape() throws SolverException, InterruptedException { assertThat(model.evaluate(smgr.makeString("\\u{39E}"))).isEqualTo("Ξ"); assertThat(model.evaluate(smgr.concat(smgr.makeString("\\u{39E"), smgr.makeString("}")))) .isEqualTo("\\u{39E}"); + + // Test with a character that is not in the BMP + if (solver != Solvers.PRINCESS) { + String str = Character.toString(0x200cb); + assertThat(model.evaluate(smgr.makeString(str))).isEqualTo(str); + } } } } From 2265152980204fb3761c00c6b00c34c5808c4293 Mon Sep 17 00:00:00 2001 From: Daniel Raffler Date: Thu, 27 Feb 2025 17:26:00 +0100 Subject: [PATCH 08/13] Strings: Disable non-BMP test for CVC5 due to a bug in the solver --- src/org/sosy_lab/java_smt/test/StringFormulaManagerTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/org/sosy_lab/java_smt/test/StringFormulaManagerTest.java b/src/org/sosy_lab/java_smt/test/StringFormulaManagerTest.java index 9e5439b8a4..4e80d0bcaf 100644 --- a/src/org/sosy_lab/java_smt/test/StringFormulaManagerTest.java +++ b/src/org/sosy_lab/java_smt/test/StringFormulaManagerTest.java @@ -148,7 +148,7 @@ public void testOutputUnescape() throws SolverException, InterruptedException { .isEqualTo("\\u{39E}"); // Test with a character that is not in the BMP - if (solver != Solvers.PRINCESS) { + if (solver != Solvers.PRINCESS && solver != Solvers.CVC5) { String str = Character.toString(0x200cb); assertThat(model.evaluate(smgr.makeString(str))).isEqualTo(str); } From 9cc096bd994f2fc81847c31bbaea40d6f33136ce Mon Sep 17 00:00:00 2001 From: Daniel Raffler Date: Tue, 25 Feb 2025 14:04:32 +0100 Subject: [PATCH 09/13] Strings: Clean up the documentation Strings in SMTLIB may contain Unicode characters from planes 0-2 --- src/org/sosy_lab/java_smt/api/StringFormulaManager.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/org/sosy_lab/java_smt/api/StringFormulaManager.java b/src/org/sosy_lab/java_smt/api/StringFormulaManager.java index 8766c874af..07fbf66179 100644 --- a/src/org/sosy_lab/java_smt/api/StringFormulaManager.java +++ b/src/org/sosy_lab/java_smt/api/StringFormulaManager.java @@ -23,8 +23,9 @@ public interface StringFormulaManager { * Creates a {@link StringFormula} representing the given constant String. * *

This method accepts plain Java Strings with Unicode characters from the Basic Multilingual - * Plane (BMP) (codepoints in range [0x00000, 0x2FFFF]). JavaSMT handles escaping internally, as - * some solvers follow the SMTLIB standard and escape Unicode characters with curly braces. + * Plane (BMP) or planes 1 and 2 (codepoints in range [0x00000, 0x2FFFF]). JavaSMT handles + * escaping internally, as some solvers follow the SMTLIB standard and escape Unicode characters + * with curly braces. * *

Additionally, you can use SMTLIB escaping like "\\u{1234}" to represent Unicode characters * directly. @@ -246,8 +247,8 @@ default RegexFormula concat(RegexFormula... parts) { /** * Returns a String formula representing the single character with the given code point, if it is - * a valid Unicode code point within the Basic Multilingual Plane (BMP) (codepoints in range - * [0x00000, 0x2FFFF]). Otherwise, returns the empty string. + * a valid Unicode code point within the Basic Multilingual Plane (BMP) or planes 1 and 2 + * (codepoints in range [0x00000, 0x2FFFF]). Otherwise, returns the empty string. */ StringFormula fromCodePoint(IntegerFormula codePoint); } From 06232f909ef2ab665cad07a3eb6530464daadfec Mon Sep 17 00:00:00 2001 From: Daniel Raffler Date: Tue, 25 Feb 2025 17:59:13 +0100 Subject: [PATCH 10/13] Strings: Remove support for Unicode escape sequences Mixing Unicode characters with SMTLIB escape sequences leads to confusing corner cases and has therefore been removed from StringFormulaManager.makeString. Users can call unescapeUnicodeForSmtlib() from AbstractStringFormulaManager to convert a String with escape sequences before handing it of to makeString(). In the other direction escapeUnicodeForSmtlib() can be used to convert (Unicode) Strings from the model into an escaped format that is compatible with other SMTLIB based solvers. --- .../AbstractStringFormulaManager.java | 19 ++- .../cvc4/CVC4StringFormulaManager.java | 3 +- .../cvc5/CVC5StringFormulaManager.java | 3 +- .../PrincessStringFormulaManager.java | 1 - .../java_smt/solvers/z3/Z3FormulaCreator.java | 5 +- .../solvers/z3/Z3StringFormulaManager.java | 3 +- .../test/StringFormulaManagerTest.java | 122 ++++++++++-------- 7 files changed, 82 insertions(+), 74 deletions(-) diff --git a/src/org/sosy_lab/java_smt/basicimpl/AbstractStringFormulaManager.java b/src/org/sosy_lab/java_smt/basicimpl/AbstractStringFormulaManager.java index 6660d58231..549d0b918a 100644 --- a/src/org/sosy_lab/java_smt/basicimpl/AbstractStringFormulaManager.java +++ b/src/org/sosy_lab/java_smt/basicimpl/AbstractStringFormulaManager.java @@ -60,22 +60,19 @@ protected IntegerFormula wrapInteger(TFormulaInfo formulaInfo) { @Override public StringFormula makeString(String value) { checkArgument( - areAllCodePointsInRange(value), - "String constant is out of supported Unicode range (Plane 0-2)."); + areAllCodePointsInRange(value, 0, 0x2FFFF), + "String constants may only contain Unicode characters from the first three planes " + + "(codepoints 0x00000 to 0x2FFFF)."); return wrapString(makeStringImpl(value)); } - /** returns whether all Unicode characters in Planes 0-2. */ - private static boolean areAllCodePointsInRange(String str) { - return str.codePoints().allMatch(AbstractStringFormulaManager::isCodePointInRange); - } - - private static boolean isCodePointInRange(int codePoint) { - return 0x00000 <= codePoint && codePoint <= 0x2FFFF; + /** Check if the codepoints of all characters in the String are in range. */ + private static boolean areAllCodePointsInRange(String str, int lower, int upper) { + return str.codePoints().allMatch(codePoint -> lower <= codePoint && codePoint <= upper); } /** Replace Unicode letters in UTF16 representation with their escape sequences. */ - protected static String escapeUnicodeForSmtlib(String input) { + public static String escapeUnicodeForSmtlib(String input) { StringBuilder sb = new StringBuilder(); for (int codePoint : input.codePoints().toArray()) { if (codePoint == 0x5c) { @@ -102,7 +99,7 @@ public static String unescapeUnicodeForSmtlib(String input) { } int codePoint = Integer.parseInt(hexCodePoint, 16); checkArgument( - isCodePointInRange(codePoint), + 0 <= codePoint && codePoint <= 0x2FFFF, "SMTLIB does only specify Unicode letters from Planes 0-2"); String replacement = Character.toString(codePoint); if (replacement.equals("\\")) { diff --git a/src/org/sosy_lab/java_smt/solvers/cvc4/CVC4StringFormulaManager.java b/src/org/sosy_lab/java_smt/solvers/cvc4/CVC4StringFormulaManager.java index e72ec9bf30..fdb360f8b6 100644 --- a/src/org/sosy_lab/java_smt/solvers/cvc4/CVC4StringFormulaManager.java +++ b/src/org/sosy_lab/java_smt/solvers/cvc4/CVC4StringFormulaManager.java @@ -29,8 +29,7 @@ class CVC4StringFormulaManager extends AbstractStringFormulaManagerThe input String may still contain Unicode characters. + */ + private StringFormula makeStringEscaped(String pString) { + return smgr.makeString(AbstractStringFormulaManager.unescapeUnicodeForSmtlib(pString)); + } + private void requireVariableStringLiterals() { // FIXME: Remove once support for operations on non-singleton Strings has been added // See https://github.com/uuverifiers/ostrich/issues/88 @@ -126,12 +136,15 @@ private void requireVariableStringLiterals() { @Test public void testInputEscape() throws SolverException, InterruptedException { - // Test if SMTLIB Unicode literals are recognized and converted to their Unicode characters. - assertEqual(smgr.length(smgr.makeString("\\u{39E}")), imgr.makeNumber(1)); - assertEqual(smgr.length(smgr.makeString("Ξ")), imgr.makeNumber(1)); + // Check that escape sequences are not substituted + assertEqual(smgr.length(smgr.makeString("\\u{39E}")), imgr.makeNumber(7)); + // ...unless we're doing so explicitly: + assertEqual(smgr.length(makeStringEscaped("\\u{39E}")), imgr.makeNumber(1)); - // Test with a character that is not in the BMP + // Test that Unicode characters are working + assertEqual(smgr.length(smgr.makeString("Ξ")), imgr.makeNumber(1)); if (solver != Solvers.PRINCESS) { + // Test with a character that is not in the BMP String str = Character.toString(0x200cb); assertEqual(smgr.length(smgr.makeString(str)), imgr.makeNumber(1)); } @@ -143,8 +156,9 @@ public void testOutputUnescape() throws SolverException, InterruptedException { try (ProverEnvironment prover = context.newProverEnvironment(ProverOptions.GENERATE_MODELS)) { assertThat(!prover.isUnsat()).isTrue(); try (Model model = prover.getModel()) { - assertThat(model.evaluate(smgr.makeString("\\u{39E}"))).isEqualTo("Ξ"); - assertThat(model.evaluate(smgr.concat(smgr.makeString("\\u{39E"), smgr.makeString("}")))) + assertThat(model.evaluate(makeStringEscaped("\\u{39E}"))).isEqualTo("Ξ"); + assertThat( + model.evaluate(smgr.concat(makeStringEscaped("\\u{39E"), makeStringEscaped("}")))) .isEqualTo("\\u{39E}"); // Test with a character that is not in the BMP @@ -197,18 +211,18 @@ public void testRegexAllCharUnicode() throws SolverException, InterruptedExcepti // special single characters. // Greek Capital Letter Delta - assertThatFormula(smgr.in(smgr.makeString("\\u0394"), regexAllChar)).isTautological(); - assertThatFormula(smgr.in(smgr.makeString("Δ"), regexAllChar)).isTautological(); + assertThatFormula(smgr.in(makeStringEscaped("\\u0394"), regexAllChar)).isTautological(); + assertThatFormula(smgr.in(makeStringEscaped("Δ"), regexAllChar)).isTautological(); // CJK Compatibility Ideograph from Basic Multilingual Plane. - assertThatFormula(smgr.in(smgr.makeString("\\u{fa6a}"), regexAllChar)).isTautological(); + assertThatFormula(smgr.in(makeStringEscaped("\\u{fa6a}"), regexAllChar)).isTautological(); // Xiangqi Black Horse from Supplementary Multilingual Plane - assertThatFormula(smgr.in(smgr.makeString("\\u{1fa6a}"), regexAllChar)).isTautological(); + assertThatFormula(smgr.in(makeStringEscaped("\\u{1fa6a}"), regexAllChar)).isTautological(); // Combining characters are not matched as one character. - assertThatFormula(smgr.in(smgr.makeString("ab"), regexAllChar)).isUnsatisfiable(); - assertThatFormula(smgr.in(smgr.makeString("abcdefgh"), regexAllChar)).isUnsatisfiable(); - assertThatFormula(smgr.in(smgr.makeString("a\\u0336"), regexAllChar)).isUnsatisfiable(); - assertThatFormula(smgr.in(smgr.makeString("\\n"), regexAllChar)).isUnsatisfiable(); + assertThatFormula(smgr.in(makeStringEscaped("ab"), regexAllChar)).isUnsatisfiable(); + assertThatFormula(smgr.in(makeStringEscaped("abcdefgh"), regexAllChar)).isUnsatisfiable(); + assertThatFormula(smgr.in(makeStringEscaped("a\\u0336"), regexAllChar)).isUnsatisfiable(); + assertThatFormula(smgr.in(makeStringEscaped("\\n"), regexAllChar)).isUnsatisfiable(); StringFormula x = smgr.makeVariable("x"); assertThatFormula(smgr.in(x, smgr.range('a', 'z'))).isSatisfiable(); @@ -400,7 +414,7 @@ public void testStringLength() throws SolverException, InterruptedException { assertEqual(imgr.makeNumber(1), smgr.length(smgr.makeString("\n"))); assertEqual(imgr.makeNumber(1), smgr.length(smgr.makeString("\t"))); - assertEqual(imgr.makeNumber(1), smgr.length(smgr.makeString("\\u{20AC}"))); + assertEqual(imgr.makeNumber(1), smgr.length(makeStringEscaped("\\u{20AC}"))); assertEqual(imgr.makeNumber(1), smgr.length(smgr.makeString("€"))); assertEqual(imgr.makeNumber(1), smgr.length(smgr.makeString("Δ"))); @@ -905,8 +919,8 @@ public void testCharAtWithSpecialCharacters() throws SolverException, Interrupte String workaround = "au{1234}"; StringFormula au1234WOEscapeCurly = smgr.makeString(workaround); StringFormula backSlash = smgr.makeString("\\"); - StringFormula u1234 = smgr.makeString("\\u{1234}"); - StringFormula au1234b = smgr.makeString("a\\u{1234}b"); + StringFormula u1234 = makeStringEscaped("\\u{1234}"); + StringFormula au1234b = makeStringEscaped("a\\u{1234}b"); assertEqual(smgr.length(backSlash), imgr.makeNumber(1)); assertEqual(smgr.charAt(au1234b, imgr.makeNumber(0)), a); @@ -936,22 +950,22 @@ public void testCharAtWithSpecialCharacters() throws SolverException, Interrupte @Test public void testUnicodeEscaping() throws SolverException, InterruptedException { // SMTLIB has different representations for the same symbol - assertEqual(a, smgr.makeString("\u0061")); - assertEqual(a, smgr.makeString("\\u0061")); - assertEqual(a, smgr.makeString("\\u{61}")); - assertEqual(a, smgr.makeString("\\u{00061}")); + assertEqual(a, makeStringEscaped("\u0061")); + assertEqual(a, makeStringEscaped("\\u0061")); + assertEqual(a, makeStringEscaped("\\u{61}")); + assertEqual(a, makeStringEscaped("\\u{00061}")); assertEqual(smgr.length(a), imgr.makeNumber(1)); - StringFormula u0 = smgr.makeString("\\u0000"); - assertEqual(u0, smgr.makeString("\u0000")); - assertEqual(u0, smgr.makeString("\\u{0}")); - assertEqual(u0, smgr.makeString("\\u{00000}")); + StringFormula u0 = makeStringEscaped("\\u0000"); + assertEqual(u0, makeStringEscaped("\u0000")); + assertEqual(u0, makeStringEscaped("\\u{0}")); + assertEqual(u0, makeStringEscaped("\\u{00000}")); assertEqual(smgr.length(u0), imgr.makeNumber(1)); - StringFormula u1 = smgr.makeString("\\u1234"); - assertEqual(u1, smgr.makeString("\u1234")); - assertEqual(u1, smgr.makeString("\\u{1234}")); - assertEqual(u1, smgr.makeString("\\u{01234}")); + StringFormula u1 = makeStringEscaped("\\u1234"); + assertEqual(u1, makeStringEscaped("\u1234")); + assertEqual(u1, makeStringEscaped("\\u{1234}")); + assertEqual(u1, makeStringEscaped("\\u{01234}")); assertEqual(smgr.length(u1), imgr.makeNumber(1)); if (solverToUse() == Solvers.PRINCESS) { @@ -961,10 +975,10 @@ public void testUnicodeEscaping() throws SolverException, InterruptedException { Character.toString(0x10000), "\\u{2FFFF}", Character.toString(0x2FFFF))) { - assertThrows(IllegalArgumentException.class, () -> smgr.makeString(invalidStr)); + assertThrows(IllegalArgumentException.class, () -> makeStringEscaped(invalidStr)); } } else { - StringFormula u2 = smgr.makeString("\\u{2FFFF}"); + StringFormula u2 = makeStringEscaped("\\u{2FFFF}"); assertEqual(u2, smgr.makeString(Character.toString(0x2FFFF))); assertEqual(smgr.length(u2), imgr.makeNumber(1)); } @@ -978,10 +992,10 @@ public void testUnicodeEscaping() throws SolverException, InterruptedException { public void testCharAtWithSpecialCharacters2Byte() throws SolverException, InterruptedException { StringFormula num7 = smgr.makeString("7"); StringFormula u = smgr.makeString("u"); - StringFormula curlyOpen2BUnicode = smgr.makeString("\\u{7B}"); - StringFormula curlyClose2BUnicode = smgr.makeString("\\u{7D}"); - StringFormula acurlyClose2BUnicodeb = smgr.makeString("a\\u{7D}b"); - StringFormula acurlyOpen2BUnicodeWOEscapeCurly = smgr.makeString("au{7B}"); + StringFormula curlyOpen2BUnicode = makeStringEscaped("\\u{7B}"); + StringFormula curlyClose2BUnicode = makeStringEscaped("\\u{7D}"); + StringFormula acurlyClose2BUnicodeb = makeStringEscaped("a\\u{7D}b"); + StringFormula acurlyOpen2BUnicodeWOEscapeCurly = makeStringEscaped("au{7B}"); StringFormula backSlash = smgr.makeString("\\"); assertEqual(smgr.length(backSlash), imgr.makeNumber(1)); @@ -1070,10 +1084,10 @@ public void testConstStringContains() throws SolverException, InterruptedExcepti StringFormula abbbbbb = smgr.makeString("abbbbbb"); StringFormula aaaaaaaB = smgr.makeString("aaaaaaaB"); StringFormula abcAndSoOn = smgr.makeString("abcdefghijklmnopqrstuVwxyz"); - StringFormula curlyOpen2BUnicode = smgr.makeString("\\u{7B}"); - StringFormula curlyClose2BUnicode = smgr.makeString("\\u{7D}"); - StringFormula multipleCurlys2BUnicode = smgr.makeString("\\u{7B}\\u{7D}\\u{7B}\\u{7B}"); - StringFormula curlyClose2BUnicodeEncased = smgr.makeString("blabla\\u{7D}bla"); + StringFormula curlyOpen2BUnicode = makeStringEscaped("\\u{7B}"); + StringFormula curlyClose2BUnicode = makeStringEscaped("\\u{7D}"); + StringFormula multipleCurlys2BUnicode = makeStringEscaped("\\u{7B}\\u{7D}\\u{7B}\\u{7B}"); + StringFormula curlyClose2BUnicodeEncased = makeStringEscaped("blabla\\u{7D}bla"); assertThatFormula(smgr.contains(empty, empty)).isSatisfiable(); assertThatFormula(smgr.contains(empty, a)).isUnsatisfiable(); @@ -1113,8 +1127,8 @@ public void testStringVariableContains() throws SolverException, InterruptedExce StringFormula bUppercase = smgr.makeString("B"); StringFormula bbbbbb = smgr.makeString("bbbbbb"); StringFormula abbbbbb = smgr.makeString("abbbbbb"); - StringFormula curlyOpen2BUnicode = smgr.makeString("\\u{7B}"); - StringFormula curlyClose2BUnicode = smgr.makeString("\\u{7D}"); + StringFormula curlyOpen2BUnicode = makeStringEscaped("\\u{7B}"); + StringFormula curlyClose2BUnicode = makeStringEscaped("\\u{7D}"); assertThatFormula( bmgr.and(smgr.contains(var1, empty), imgr.equal(imgr.makeNumber(0), smgr.length(var1)))) @@ -1175,10 +1189,10 @@ public void testConstStringIndexOf() throws SolverException, InterruptedExceptio StringFormula bbbbbbb = smgr.makeString("bbbbbbb"); StringFormula abbbbbb = smgr.makeString("abbbbbb"); StringFormula abcAndSoOn = smgr.makeString("abcdefghijklmnopqrstuVwxyz"); - StringFormula curlyOpen2BUnicode = smgr.makeString("\\u{7B}"); - StringFormula curlyClose2BUnicode = smgr.makeString("\\u{7D}"); - StringFormula multipleCurlys2BUnicode = smgr.makeString("\\u{7B}\\u{7D}\\u{7B}\\u{7B}"); - StringFormula curlys2BUnicodeWOEscape = smgr.makeString("\\u7B\\u7D"); + StringFormula curlyOpen2BUnicode = makeStringEscaped("\\u{7B}"); + StringFormula curlyClose2BUnicode = makeStringEscaped("\\u{7D}"); + StringFormula multipleCurlys2BUnicode = makeStringEscaped("\\u{7B}\\u{7D}\\u{7B}\\u{7B}"); + StringFormula curlys2BUnicodeWOEscape = makeStringEscaped("\\u7B\\u7D"); IntegerFormula zero = imgr.makeNumber(0); IntegerFormula one = imgr.makeNumber(1); @@ -1213,7 +1227,7 @@ public void testStringVariableIndexOf() throws SolverException, InterruptedExcep StringFormula var2 = smgr.makeVariable("var2"); IntegerFormula intVar = imgr.makeVariable("intVar"); - StringFormula curlyOpen2BUnicode = smgr.makeString("\\u{7B}"); + StringFormula curlyOpen2BUnicode = makeStringEscaped("\\u{7B}"); IntegerFormula zero = imgr.makeNumber(0); @@ -1316,9 +1330,9 @@ public void testConstStringSubStrings() throws SolverException, InterruptedExcep StringFormula aUppercase = smgr.makeString("A"); StringFormula bUppercase = smgr.makeString("B"); StringFormula bbbbbb = smgr.makeString("bbbbbb"); - StringFormula curlyOpen2BUnicode = smgr.makeString("\\u{7B}"); - StringFormula curlyClose2BUnicode = smgr.makeString("\\u{7D}"); - StringFormula multipleCurlys2BUnicode = smgr.makeString("\\u{7B}\\u{7D}\\u{7B}\\u{7B}"); + StringFormula curlyOpen2BUnicode = makeStringEscaped("\\u{7B}"); + StringFormula curlyClose2BUnicode = makeStringEscaped("\\u{7D}"); + StringFormula multipleCurlys2BUnicode = makeStringEscaped("\\u{7B}\\u{7D}\\u{7B}\\u{7B}"); IntegerFormula zero = imgr.makeNumber(0); IntegerFormula one = imgr.makeNumber(1); @@ -1363,8 +1377,8 @@ public void testStringSubstringOutOfBounds() throws SolverException, Interrupted StringFormula bbbbbb = smgr.makeString("bbbbbb"); StringFormula abbbbbb = smgr.makeString("abbbbbb"); - StringFormula multipleCurlys2BUnicode = smgr.makeString("\\u{7B}\\u{7D}\\u{7B}\\u{7B}"); - StringFormula multipleCurlys2BUnicodeFromIndex1 = smgr.makeString("\\u{7D}\\u{7B}\\u{7B}"); + StringFormula multipleCurlys2BUnicode = makeStringEscaped("\\u{7B}\\u{7D}\\u{7B}\\u{7B}"); + StringFormula multipleCurlys2BUnicodeFromIndex1 = makeStringEscaped("\\u{7D}\\u{7B}\\u{7B}"); assertEqual(smgr.substring(abbbbbb, imgr.makeNumber(0), imgr.makeNumber(10000)), abbbbbb); assertEqual(smgr.substring(abbbbbb, imgr.makeNumber(6), imgr.makeNumber(10000)), b); @@ -1760,11 +1774,11 @@ public void testStringVariableReplaceAllSubstring() throws SolverException, Inte public void testStringConcatWUnicode() throws SolverException, InterruptedException { StringFormula backslash = smgr.makeString("\\"); StringFormula u = smgr.makeString("u"); - StringFormula curlyOpen = smgr.makeString("\\u{7B}"); + StringFormula curlyOpen = makeStringEscaped("\\u{7B}"); StringFormula sevenB = smgr.makeString("7B"); - StringFormula curlyClose = smgr.makeString("\\u{7D}"); + StringFormula curlyClose = makeStringEscaped("\\u{7D}"); StringFormula concat = smgr.concat(backslash, u, curlyOpen, sevenB, curlyClose); - StringFormula complete = smgr.makeString("\\u{7B}"); + StringFormula complete = makeStringEscaped("\\u{7B}"); // Concatting parts of unicode does not result in the unicode char! assertDistinct(concat, complete); From 9aaefafad9f3e438ba7b47799a8e62eafaa116cb Mon Sep 17 00:00:00 2001 From: Daniel Raffler Date: Tue, 25 Feb 2025 19:59:08 +0100 Subject: [PATCH 11/13] Strings: Fix evaluation of Unicode String formulas in ModelEvaluationTest --- src/org/sosy_lab/java_smt/test/ModelEvaluationTest.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/org/sosy_lab/java_smt/test/ModelEvaluationTest.java b/src/org/sosy_lab/java_smt/test/ModelEvaluationTest.java index 16d3cff2ff..8974490f3d 100644 --- a/src/org/sosy_lab/java_smt/test/ModelEvaluationTest.java +++ b/src/org/sosy_lab/java_smt/test/ModelEvaluationTest.java @@ -25,6 +25,7 @@ import org.sosy_lab.java_smt.api.ProverEnvironment; import org.sosy_lab.java_smt.api.SolverContext.ProverOptions; import org.sosy_lab.java_smt.api.SolverException; +import org.sosy_lab.java_smt.basicimpl.AbstractStringFormulaManager; /** Test that we can request evaluations from models. */ public class ModelEvaluationTest extends SolverBasedTest0.ParameterizedSolverBasedTest0 { @@ -168,7 +169,11 @@ public void testGetStringsEvaluation() throws SolverException, InterruptedExcept // Unicode evaluateInModel( - smgr.equal(smgr.makeVariable("x"), smgr.makeString("hello æ@€ \u1234 \\u{4321}")), + smgr.equal( + smgr.makeVariable("x"), + smgr.makeString( + AbstractStringFormulaManager.unescapeUnicodeForSmtlib( + "hello æ@€ \u1234 \\u{4321}"))), smgr.makeVariable("x"), Lists.newArrayList("hello \u00e6@\u20ac \u1234 \u4321"), Lists.newArrayList(smgr.makeString("hello \u00e6@\u20ac \u1234 \u4321"))); From cabdad20f7ba501bae0b647d9ecb788d5b147ec2 Mon Sep 17 00:00:00 2001 From: Daniel Raffler Date: Wed, 26 Feb 2025 14:02:26 +0100 Subject: [PATCH 12/13] Strings: Fix format of the hex constant when printing codepoints for Unicode escape sequences --- .../java_smt/basicimpl/AbstractStringFormulaManager.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/org/sosy_lab/java_smt/basicimpl/AbstractStringFormulaManager.java b/src/org/sosy_lab/java_smt/basicimpl/AbstractStringFormulaManager.java index 549d0b918a..71af01b1a4 100644 --- a/src/org/sosy_lab/java_smt/basicimpl/AbstractStringFormulaManager.java +++ b/src/org/sosy_lab/java_smt/basicimpl/AbstractStringFormulaManager.java @@ -82,7 +82,7 @@ public static String escapeUnicodeForSmtlib(String input) { } else if (0x20 <= codePoint && codePoint <= 0x7E) { sb.appendCodePoint(codePoint); // normal printable chars } else { - sb.append("\\u{").append(String.format("%05X", codePoint)).append("}"); + sb.append("\\u{").append(String.format("%x", codePoint)).append("}"); } } return sb.toString(); From 795b3160df87b275b663e2263a40e56069ec8c0c Mon Sep 17 00:00:00 2001 From: Daniel Raffler Date: Wed, 26 Feb 2025 14:49:13 +0100 Subject: [PATCH 13/13] Strings: Update JavaDoc for StringFormulaManager.makeString --- .../java_smt/api/StringFormulaManager.java | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/org/sosy_lab/java_smt/api/StringFormulaManager.java b/src/org/sosy_lab/java_smt/api/StringFormulaManager.java index 07fbf66179..3d3c04f061 100644 --- a/src/org/sosy_lab/java_smt/api/StringFormulaManager.java +++ b/src/org/sosy_lab/java_smt/api/StringFormulaManager.java @@ -12,6 +12,7 @@ import java.util.Arrays; import java.util.List; import org.sosy_lab.java_smt.api.NumeralFormula.IntegerFormula; +import org.sosy_lab.java_smt.basicimpl.AbstractStringFormulaManager; /** * Manager for dealing with string formulas. Functions come from This method accepts plain Java Strings with Unicode characters from the Basic Multilingual - * Plane (BMP) or planes 1 and 2 (codepoints in range [0x00000, 0x2FFFF]). JavaSMT handles - * escaping internally, as some solvers follow the SMTLIB standard and escape Unicode characters - * with curly braces. - * - *

Additionally, you can use SMTLIB escaping like "\\u{1234}" to represent Unicode characters - * directly. + *

The String argument is expected to be a regular Java String and may contain Unicode + * characters from the first 3 planes (codepoints 0x00000-0x2FFFFF). Higher codepoints are not + * allowed due to limitations in SMTLIB. We do not support SMTLIB escape sequences in this method, + * and String like "\\u{abc}" are read verbatim and are not substituted with their + * Unicode character. If you still want to use SMTLIB Strings with this method, the function + * {@link AbstractStringFormulaManager#unescapeUnicodeForSmtlib(String)} can be used to handle the + * conversion before calling this method. Note that you may then also have to use {@link + * AbstractStringFormulaManager#escapeUnicodeForSmtlib(String)} to convert Strings from the model + * back to a format that is compatible with other SMTLIB based solvers. * * @param value the string value the returned {@link StringFormula} should represent * @return a {@link StringFormula} representing the given value