Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix handling of Unicode characters in String theory #431

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 12 additions & 8 deletions src/org/sosy_lab/java_smt/api/StringFormulaManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import java.util.Arrays;
import java.util.List;
import org.sosy_lab.java_smt.api.NumeralFormula.IntegerFormula;
import org.sosy_lab.java_smt.basicimpl.AbstractStringFormulaManager;

/**
* Manager for dealing with string formulas. Functions come from <a
Expand All @@ -22,12 +23,15 @@ public interface StringFormulaManager {
/**
* Creates a {@link StringFormula} representing the given constant String.
*
* <p>This method accepts plain Java Strings with Unicode characters from the Basic Multilingual
* Plane (BMP) (codepoints in range [0x00000, 0x2FFFF]). JavaSMT handles escaping internally, as
* some solvers follow the SMTLIB standard and escape Unicode characters with curly braces.
*
* <p>Additionally, you can use SMTLIB escaping like "\\u{1234}" to represent Unicode characters
* directly.
* <p>The String argument is expected to be a regular Java String and may contain Unicode
* characters from the first 3 planes (codepoints 0x00000-0x2FFFFF). Higher codepoints are not
* allowed due to limitations in SMTLIB. We do not support SMTLIB escape sequences in this method,
* and String like <code>"\\u{abc}"</code> are read verbatim and are not substituted with their
* Unicode character. If you still want to use SMTLIB Strings with this method, the function
* {@link AbstractStringFormulaManager#unescapeUnicodeForSmtlib(String)} can be used to handle the
* conversion before calling this method. Note that you may then also have to use {@link
* AbstractStringFormulaManager#escapeUnicodeForSmtlib(String)} to convert Strings from the model
* back to a format that is compatible with other SMTLIB based solvers.
*
* @param value the string value the returned {@link StringFormula} should represent
* @return a {@link StringFormula} representing the given value
Expand Down Expand Up @@ -246,8 +250,8 @@ default RegexFormula concat(RegexFormula... parts) {

/**
* Returns a String formula representing the single character with the given code point, if it is
* a valid Unicode code point within the Basic Multilingual Plane (BMP) (codepoints in range
* [0x00000, 0x2FFFF]). Otherwise, returns the empty string.
* a valid Unicode code point within the Basic Multilingual Plane (BMP) or planes 1 and 2
* (codepoints in range [0x00000, 0x2FFFF]). Otherwise, returns the empty string.
*/
StringFormula fromCodePoint(IntegerFormula codePoint);
}
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public abstract class AbstractStringFormulaManager<TFormulaInfo, TType, TEnv, TF
+ "((?<codePoint>[0-9a-fA-F]{4})"
+ "|"
// or curly brackets like "\\u{61}"
+ "(\\{(?<codePointInBrackets>[0-9a-fA-F]{1,5})}))");
+ "(\\{(?<codePointInBrackets>[0-9a-fA-F]{1,5})\\}))");

protected AbstractStringFormulaManager(
FormulaCreator<TFormulaInfo, TType, TEnv, TFuncDecl> pCreator) {
Expand All @@ -60,28 +60,29 @@ protected IntegerFormula wrapInteger(TFormulaInfo formulaInfo) {
@Override
public StringFormula makeString(String value) {
checkArgument(
areAllCodePointsInRange(value),
"String constant is out of supported Unicode range (Plane 0-2).");
areAllCodePointsInRange(value, 0, 0x2FFFF),
"String constants may only contain Unicode characters from the first three planes "
+ "(codepoints 0x00000 to 0x2FFFF).");
return wrapString(makeStringImpl(value));
}

/** returns whether all Unicode characters in Planes 0-2. */
private static boolean areAllCodePointsInRange(String str) {
return str.codePoints().allMatch(AbstractStringFormulaManager::isCodePointInRange);
}

private static boolean isCodePointInRange(int codePoint) {
return 0x00000 <= codePoint && codePoint <= 0x2FFFF;
/** Check if the codepoints of all characters in the String are in range. */
private static boolean areAllCodePointsInRange(String str, int lower, int upper) {
return str.codePoints().allMatch(codePoint -> lower <= codePoint && codePoint <= upper);
}

/** Replace Unicode letters in UTF16 representation with their escape sequences. */
protected static String escapeUnicodeForSmtlib(String input) {
public static String escapeUnicodeForSmtlib(String input) {
StringBuilder sb = new StringBuilder();
for (int codePoint : input.codePoints().toArray()) {
if (0x20 <= codePoint && codePoint <= 0x7E) {
if (codePoint == 0x5c) {
// Backslashes must be escaped, otherwise they may get substituted when reading back
// the results from the model
sb.append("\\u{5c}");
} else if (0x20 <= codePoint && codePoint <= 0x7E) {
sb.appendCodePoint(codePoint); // normal printable chars
} else {
sb.append("\\u{").append(String.format("%05X", codePoint)).append("}");
sb.append("\\u{").append(String.format("%x", codePoint)).append("}");
}
}
return sb.toString();
Expand All @@ -98,9 +99,15 @@ public static String unescapeUnicodeForSmtlib(String input) {
}
int codePoint = Integer.parseInt(hexCodePoint, 16);
checkArgument(
isCodePointInRange(codePoint),
0 <= codePoint && codePoint <= 0x2FFFF,
"SMTLIB does only specify Unicode letters from Planes 0-2");
matcher.appendReplacement(sb, Character.toString(codePoint));
String replacement = Character.toString(codePoint);
if (replacement.equals("\\")) {
// Matcher.appendReplacement considers '\' as special character.
// Substitute with '\\' instead
replacement = "\\\\";
}
matcher.appendReplacement(sb, replacement);
}
matcher.appendTail(sb);
return sb.toString();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ class CVC4StringFormulaManager extends AbstractStringFormulaManager<Expr, Type,

@Override
protected Expr makeStringImpl(String pValue) {
// The boolean enables escape characters!
return exprManager.mkConst(new CVC4String(escapeUnicodeForSmtlib(pValue), true));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ public class PrincessStringFormulaManager

@Override
protected IExpression makeStringImpl(String value) {
value = unescapeUnicodeForSmtlib(value);
checkArgument(!containsSurrogatePair(value), "Princess does not support surrogate pairs.");
IExpression strExpr = PrincessEnvironment.stringTheory.string2Term(value);
return getFormulaCreator().getEnv().simplify(strExpr); // simplify MOD in chars
Expand Down
5 changes: 3 additions & 2 deletions src/org/sosy_lab/java_smt/solvers/z3/Z3FormulaCreator.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
package org.sosy_lab.java_smt.solvers.z3;

import static com.google.common.base.Preconditions.checkArgument;
import static org.sosy_lab.java_smt.basicimpl.AbstractStringFormulaManager.unescapeUnicodeForSmtlib;

import com.google.common.base.Preconditions;
import com.google.common.collect.HashBasedTable;
Expand Down Expand Up @@ -58,6 +57,7 @@
import org.sosy_lab.java_smt.api.SolverException;
import org.sosy_lab.java_smt.api.StringFormula;
import org.sosy_lab.java_smt.api.visitors.FormulaVisitor;
import org.sosy_lab.java_smt.basicimpl.AbstractStringFormulaManager;
import org.sosy_lab.java_smt.basicimpl.FormulaCreator;
import org.sosy_lab.java_smt.basicimpl.FunctionDeclarationImpl;
import org.sosy_lab.java_smt.solvers.z3.Z3Formula.Z3ArrayFormula;
Expand Down Expand Up @@ -921,7 +921,8 @@ public Object convertValue(Long value) {
Rational ratValue = Rational.ofString(Native.getNumeralString(environment, value));
return ratValue.isIntegral() ? ratValue.getNum() : ratValue;
} else if (type.isStringType()) {
return unescapeUnicodeForSmtlib(Native.getString(environment, value));
String str = Native.getString(environment, value);
return AbstractStringFormulaManager.unescapeUnicodeForSmtlib(str);
} else if (type.isBitvectorType()) {
return new BigInteger(Native.getNumeralString(environment, value));
} else if (type.isFloatingPointType()) {
Expand Down
13 changes: 8 additions & 5 deletions src/org/sosy_lab/java_smt/test/ModelEvaluationTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import org.sosy_lab.java_smt.api.ProverEnvironment;
import org.sosy_lab.java_smt.api.SolverContext.ProverOptions;
import org.sosy_lab.java_smt.api.SolverException;
import org.sosy_lab.java_smt.basicimpl.AbstractStringFormulaManager;

/** Test that we can request evaluations from models. */
public class ModelEvaluationTest extends SolverBasedTest0.ParameterizedSolverBasedTest0 {
Expand Down Expand Up @@ -168,19 +169,21 @@ public void testGetStringsEvaluation() throws SolverException, InterruptedExcept

// Unicode
evaluateInModel(
smgr.equal(smgr.makeVariable("x"), smgr.makeString("hello æ@€ \u1234 \\u{4321}")),
smgr.equal(
smgr.makeVariable("x"),
smgr.makeString(
AbstractStringFormulaManager.unescapeUnicodeForSmtlib(
"hello æ@€ \u1234 \\u{4321}"))),
smgr.makeVariable("x"),
Lists.newArrayList("hello \u00e6@\u20ac \u1234 \u4321"),
Lists.newArrayList(smgr.makeString("hello \u00e6@\u20ac \u1234 \u4321")));

// TODO Z3 and CVC4 seem to break escaping on invalid Unicode Strings.
/*
evaluateInModel(
// invalid Unicode escape sequences (should be treated as normal characters)
evaluateInModel(
smgr.equal(smgr.makeVariable("x"), smgr.makeString("\\u")),
smgr.makeVariable("x"),
Lists.newArrayList("\\u"),
Lists.newArrayList(smgr.makeString("\\u")));
*/

// foreign variable: x vs y
evaluateInModel(
Expand Down
Loading