diff --git a/search-core/src/main/java/org/opengroup/osdu/search/util/QueryParserUtil.java b/search-core/src/main/java/org/opengroup/osdu/search/util/QueryParserUtil.java index c5363a9aa68f2f7b18c90f98f20a763222e0f980..9efc33a131aea4652419c1ac5eb54d8da4a4767b 100644 --- a/search-core/src/main/java/org/opengroup/osdu/search/util/QueryParserUtil.java +++ b/search-core/src/main/java/org/opengroup/osdu/search/util/QueryParserUtil.java @@ -1,9 +1,8 @@ package org.opengroup.osdu.search.util; import co.elastic.clients.elasticsearch._types.query_dsl.BoolQuery; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; + +import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.http.HttpStatus; @@ -132,27 +131,44 @@ public class QueryParserUtil implements IQueryParserUtil { while (m.find()) { orPositions.add(m.start()); } + if(!this.hasBalancedQuotes(queryString)) { + throw new AppException(HttpStatus.SC_BAD_REQUEST, "Malformed query", + String.format("Malformed unbalanced double quotes in query: \"%s\"", queryString)); + } + StringBuilder token = new StringBuilder(); List<String> tokens = new ArrayList<>(); - for (char c : queryString.toCharArray()) { + boolean doubleQuoteStarted = false; + char[] queryChars = queryString.toCharArray(); + for (char c : queryChars) { if (token.length() != 0 || c != ' ') { token.append(c); } - if (c == '(') { - height++; - } else if (c == ')') { - if (height == 1 && token.length() > 0) { + if(isDoubleQuote(queryChars, position)) { + if(doubleQuoteStarted) { + doubleQuoteStarted = false; + } + else { + doubleQuoteStarted = true; + } + } + if(!doubleQuoteStarted) { + if (c == '(') { + height++; + } else if (c == ')') { + if (height == 1 && token.length() > 0) { + tokens.add(token.toString()); + token = new StringBuilder(); + } + height--; + if (height < 0) { + throw new AppException(HttpStatus.SC_BAD_REQUEST, "Malformed query", + String.format("Malformed closing parentheses in query part: \"%s\", at position: %d", queryString, position)); + } + } else if (height == 0 && token.length() > 0 && (andPositions.contains(position + 1) || orPositions.contains(position + 1))) { tokens.add(token.toString()); token = new StringBuilder(); } - height--; - if (height < 0) { - throw new AppException(HttpStatus.SC_BAD_REQUEST, "Malformed query", - String.format("Malformed closing parentheses in query part: \"%s\", at position: %d", queryString, position)); - } - } else if (height == 0 && token.length() > 0 && (andPositions.contains(position + 1) || orPositions.contains(position + 1))) { - tokens.add(token.toString()); - token = new StringBuilder(); } position++; } @@ -169,6 +185,52 @@ public class QueryParserUtil implements IQueryParserUtil { return transformStringTokensToQueryNode(tokens); } + private boolean isDoubleQuote(char[] queryChars, int position) { + if(queryChars[position] != '"') + return false; + + int lastEscapeCharacterPosition = -1; + for(int i = position -1; i >= 0; i--) { + if(queryChars[i] == '\\') { + lastEscapeCharacterPosition = i; + } + else { + break; + } + } + return (lastEscapeCharacterPosition == -1 || (position - lastEscapeCharacterPosition)%2 == 0); + } + + private boolean hasBalancedQuotes(String query) { + Deque<Character> stack = new ArrayDeque<>(); + + int escapeCharStartPosition = -1; + for (int i = 0; i < query.length(); i++) { + char currentChar = query.charAt(i); + + // If the current character is an escape character + if (currentChar == '\\') { + if(escapeCharStartPosition == -1) { + escapeCharStartPosition = i; + } + continue; + } + + // If the current character is an unescaped quote + if (currentChar == '"' && (escapeCharStartPosition == -1 || (i - escapeCharStartPosition)%2 == 0)) { + if (stack.isEmpty()) { + stack.push(currentChar); // Open a new pair of quotes + } else if (stack.peek() == '"') { + stack.pop(); // Close the pair of quotes + } + } + escapeCharStartPosition = -1; + } + + // If the stack is empty, all quotes are balanced + return stack.isEmpty(); + } + private List<QueryNode> transformStringTokensToQueryNode(List<String> tokens) { if (tokens.size() > 1) { tokens.set(0, defineLeadingTokenOperator(tokens.get(0), tokens.get(1))); diff --git a/search-core/src/test/resources/testqueries/expected/nested-query-with-AND-OR-keywords-inside-double-quotes.json b/search-core/src/test/resources/testqueries/expected/nested-query-with-AND-OR-keywords-inside-double-quotes.json new file mode 100644 index 0000000000000000000000000000000000000000..74b91644d39b0b4f731a5f067f128bc8780072f1 --- /dev/null +++ b/search-core/src/test/resources/testqueries/expected/nested-query-with-AND-OR-keywords-inside-double-quotes.json @@ -0,0 +1,51 @@ +{ + "bool": { + "boost": 1.0, + "should": [{ + "query_string": { + "boost": 1.0, + "allow_leading_wildcard": false, + "auto_generate_synonyms_phrase_query": true, + "default_operator": "or", + "enable_position_increments": true, + "escape": false, + "fields": [], + "fuzziness": "AUTO", + "fuzzy_max_expansions": 50, + "fuzzy_prefix_length": 0, + "fuzzy_transpositions": true, + "max_determinized_states": 10000, + "phrase_slop": 0.0, + "query": "\"test AND test2 OR test3\"", + "type": "best_fields" + } + }, { + "nested": { + "boost": 1.0, + "ignore_unmapped": true, + "path": "data.comments", + "query": { + "query_string": { + "boost": 1.0, + "allow_leading_wildcard": false, + "auto_generate_synonyms_phrase_query": true, + "default_operator": "or", + "enable_position_increments": true, + "escape": false, + "fields": [], + "fuzziness": "AUTO", + "fuzzy_max_expansions": 50, + "fuzzy_prefix_length": 0, + "fuzzy_transpositions": true, + "max_determinized_states": 10000, + "phrase_slop": 0.0, + "query": "(\"test AND test2 OR test3\")", + "type": "best_fields" + } + }, + "score_mode": "avg" + } + } + ] + } +} diff --git a/search-core/src/test/resources/testqueries/expected/nested-query-with-escaped-quote-inside-double-quotes.json b/search-core/src/test/resources/testqueries/expected/nested-query-with-escaped-quote-inside-double-quotes.json new file mode 100644 index 0000000000000000000000000000000000000000..f6e356f7a8d89f4954fada6f340e12da7c7da244 --- /dev/null +++ b/search-core/src/test/resources/testqueries/expected/nested-query-with-escaped-quote-inside-double-quotes.json @@ -0,0 +1,51 @@ +{ + "bool": { + "boost": 1.0, + "should": [{ + "query_string": { + "boost": 1.0, + "allow_leading_wildcard": false, + "auto_generate_synonyms_phrase_query": true, + "default_operator": "or", + "enable_position_increments": true, + "escape": false, + "fields": [], + "fuzziness": "AUTO", + "fuzzy_max_expansions": 50, + "fuzzy_prefix_length": 0, + "fuzzy_transpositions": true, + "max_determinized_states": 10000, + "phrase_slop": 0.0, + "query": "\"bala\\\" AND bala\"", + "type": "best_fields" + } + }, { + "nested": { + "boost": 1.0, + "ignore_unmapped": true, + "path": "data.comments", + "query": { + "query_string": { + "boost": 1.0, + "allow_leading_wildcard": false, + "auto_generate_synonyms_phrase_query": true, + "default_operator": "or", + "enable_position_increments": true, + "escape": false, + "fields": [], + "fuzziness": "AUTO", + "fuzzy_max_expansions": 50, + "fuzzy_prefix_length": 0, + "fuzzy_transpositions": true, + "max_determinized_states": 10000, + "phrase_slop": 0.0, + "query": "(\"bala\\\" AND bala\")", + "type": "best_fields" + } + }, + "score_mode": "avg" + } + } + ] + } +} diff --git a/search-core/src/test/resources/testqueries/expected/nested-query-with-parent-keywords-inside-double-quotes.json b/search-core/src/test/resources/testqueries/expected/nested-query-with-parent-keywords-inside-double-quotes.json new file mode 100644 index 0000000000000000000000000000000000000000..c10f86734161367aae8681b65db5881e6927ece5 --- /dev/null +++ b/search-core/src/test/resources/testqueries/expected/nested-query-with-parent-keywords-inside-double-quotes.json @@ -0,0 +1,51 @@ +{ + "bool": { + "boost": 1.0, + "should": [{ + "query_string": { + "boost": 1.0, + "allow_leading_wildcard": false, + "auto_generate_synonyms_phrase_query": true, + "default_operator": "or", + "enable_position_increments": true, + "escape": false, + "fields": [], + "fuzziness": "AUTO", + "fuzzy_max_expansions": 50, + "fuzzy_prefix_length": 0, + "fuzzy_transpositions": true, + "max_determinized_states": 10000, + "phrase_slop": 0.0, + "query": "\"test(v1)\"", + "type": "best_fields" + } + }, { + "nested": { + "boost": 1.0, + "ignore_unmapped": true, + "path": "data.comments", + "query": { + "query_string": { + "boost": 1.0, + "allow_leading_wildcard": false, + "auto_generate_synonyms_phrase_query": true, + "default_operator": "or", + "enable_position_increments": true, + "escape": false, + "fields": [], + "fuzziness": "AUTO", + "fuzzy_max_expansions": 50, + "fuzzy_prefix_length": 0, + "fuzzy_transpositions": true, + "max_determinized_states": 10000, + "phrase_slop": 0.0, + "query": "(\"test(v1)\")", + "type": "best_fields" + } + }, + "score_mode": "avg" + } + } + ] + } +} diff --git a/search-core/src/test/resources/testqueries/top-level-nodes-count.json b/search-core/src/test/resources/testqueries/top-level-nodes-count.json index e1faae0f150221e7a458334fbc766fba0a790cef..e9a56cf8e76b3d01269bf1c291bcde842e8fc03b 100644 --- a/search-core/src/test/resources/testqueries/top-level-nodes-count.json +++ b/search-core/src/test/resources/testqueries/top-level-nodes-count.json @@ -15,5 +15,8 @@ "inner-nested-query": 1, "inner-multilevel-nested-query": 1, "nested-query-with-space": 1, - "multilevel-nested-query-with-space": 1 + "multilevel-nested-query-with-space": 1, + "nested-query-with-AND-OR-keywords-inside-double-quotes": 2, + "nested-query-with-parent-keywords-inside-double-quotes": 2, + "nested-query-with-escaped-quote-inside-double-quotes": 2 } diff --git a/search-core/src/test/resources/testqueries/valid-queries.json b/search-core/src/test/resources/testqueries/valid-queries.json index 124a6675c6feecf7dd8d79bae0d24c0d0d081ab4..c198ada97fc3d7404dc4ee8578d11d58564bd5a6 100644 --- a/search-core/src/test/resources/testqueries/valid-queries.json +++ b/search-core/src/test/resources/testqueries/valid-queries.json @@ -15,6 +15,9 @@ "inner-nested-query": "(nested(data.NestedTest, (NumberTest:(12345.0 OR 0) AND StringTest:\"test string\")) AND data.First:\"Example*\")", "inner-multilevel-nested-query": "(nested(data.FirstLevel, nested(data.FirstLevel.SecondLevel, nested(data.FirstLevel.SecondLevel.ThirdLevel, (ThirdLevelNumberTest:\"12345.0\")))))", "nested-query-with-space": "nested (data.NestedTest,(NumberTest:(12345.0 OR 0) AND StringTest:\"test string\"))", - "multilevel-nested-query-with-space": "nested (data.FirstLevel, nested(data.FirstLevel.SecondLevel, nested(data.FirstLevel.SecondLevel.ThirdLevel,(ThirdLevelNumberTest:\"12345.0\"))))" + "multilevel-nested-query-with-space": "nested (data.FirstLevel, nested(data.FirstLevel.SecondLevel, nested(data.FirstLevel.SecondLevel.ThirdLevel,(ThirdLevelNumberTest:\"12345.0\"))))", + "nested-query-with-AND-OR-keywords-inside-double-quotes": "\"test AND test2 OR test3\" OR nested(data.comments, (\"test AND test2 OR test3\"))", + "nested-query-with-parent-keywords-inside-double-quotes": "\"test(v1)\" OR nested(data.comments, (\"test(v1)\"))", + "nested-query-with-escaped-quote-inside-double-quotes":"\"bala\\\" AND bala\" OR nested(data.comments, (\"bala\\\" AND bala\"))" }