From e4c5a941ee49bb7d239ba8210256b6fb0d70839c Mon Sep 17 00:00:00 2001 From: sirjonasxx <36828922+sirjonasxx@users.noreply.github.com> Date: Thu, 21 Jan 2021 04:32:12 +0100 Subject: [PATCH] utf-8 string prediction --- .../prediction/checkers/StringChecker.java | 28 +++++++++++++++---- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/G-Earth/src/main/java/gearth/misc/packetrepresentation/prediction/checkers/StringChecker.java b/G-Earth/src/main/java/gearth/misc/packetrepresentation/prediction/checkers/StringChecker.java index fbb45dd..4815e32 100644 --- a/G-Earth/src/main/java/gearth/misc/packetrepresentation/prediction/checkers/StringChecker.java +++ b/G-Earth/src/main/java/gearth/misc/packetrepresentation/prediction/checkers/StringChecker.java @@ -64,13 +64,29 @@ public class StringChecker extends TypeChecker { }; for (int i = 0; i < s.length(); i++) { - score *= penalties[isCommon( - asChars[i], - asBytes[i] - )]; - if (score < 0.001) { - return 0; + // detect UTF8 extended chars + if ((asBytes[i] & 0b11100000) == 0b11000000 && i < s.length() - 1 && (asBytes[i+1] & 0b11000000) == 0b10000000) { + i += 1; + score *= penalties[2]*penalties[2]; + } + else if ((asBytes[i] & 0b11110000) == 0b11100000 && i < s.length() - 2 && (asBytes[i+1] & 0b11000000) == 0b10000000 && (asBytes[i+2] & 0b11000000) == 0b10000000) { + i += 2; + score *= penalties[2]*penalties[2]*penalties[2]; + } + else if ((asBytes[i] & 0b11111000) == 0b11110000 && i < s.length() - 3 && (asBytes[i+1] & 0b11000000) == 0b10000000 && (asBytes[i+2] & 0b11000000) == 0b10000000 && (asBytes[i+3] & 0b11000000) == 0b10000000) { + i += 3; + score *= penalties[2]*penalties[2]*penalties[2]*penalties[2]; + } + else { + score *= penalties[isCommon( + asChars[i], + asBytes[i] + )]; + + if (score < 0.001) { + return 0; + } } }