From b5e026575853ac7a7d984c7937320d54a96d5436 Mon Sep 17 00:00:00 2001 From: Matthew Chen Date: Tue, 12 Jun 2018 16:40:28 -0400 Subject: [PATCH 1/8] Filter search index text. --- Signal/test/util/SearcherTest.swift | 6 +++ .../src/Storage/FullTextSearchFinder.swift | 52 +++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/Signal/test/util/SearcherTest.swift b/Signal/test/util/SearcherTest.swift index 0fcba4d57..2472a754e 100644 --- a/Signal/test/util/SearcherTest.swift +++ b/Signal/test/util/SearcherTest.swift @@ -341,4 +341,10 @@ class SearcherTest: XCTestCase { XCTAssert(searcher.matches(item: stinkingLizaveta, query: "Liza 323")) XCTAssertFalse(searcher.matches(item: regularLizaveta, query: "Liza 323")) } + + func testTextSanitization() { + XCTAssertEqual(FullTextSearchFinder.normalize(text: "Liza"), "Liza") + XCTAssertEqual(FullTextSearchFinder.normalize(text: "Liza +1-323"), "Liza 1 323") + XCTAssertEqual(FullTextSearchFinder.normalize(text: "\"\\'!&@#$%^&*()Liza +1-323"), "Liza 1 323") + } } diff --git a/SignalServiceKit/src/Storage/FullTextSearchFinder.swift b/SignalServiceKit/src/Storage/FullTextSearchFinder.swift index e7bca46cf..b830d32c8 100644 --- a/SignalServiceKit/src/Storage/FullTextSearchFinder.swift +++ b/SignalServiceKit/src/Storage/FullTextSearchFinder.swift @@ -28,6 +28,7 @@ public class SearchIndexer { normalized = String(String.UnicodeScalarView(nonformattingScalars)) return normalized +// return FullTextSearchFinder.filterIndexOrQueryText(text: indexingText) } } @@ -64,6 +65,57 @@ public class FullTextSearchFinder: NSObject { } } + // Mark: Filtering + +// private class func characterSet(fromCharacter: UInt32, toCharacter: UInt32) -> CharacterSet { +// var string = "" +// // Add to include last character. +// for character in fromCharacter ..< toCharacter + 1 { +// guard let chr = Unicode.Scalar(character) else { +// assertionFailure("\(self.logTag) could not parse character.") +// continue +// } +// string += String(chr) +// } +// return CharacterSet(charactersIn: string) +// } +// +// private static var kFilterCharacters: CharacterSet = { +// var set = CharacterSet() +// set.formUnion(FullTextSearchFinder.characterSet(fromCharacter: 0, toCharacter: 31)) +// set.formUnion(FullTextSearchFinder.characterSet(fromCharacter: 33, toCharacter: 47)) +// set.formUnion(FullTextSearchFinder.characterSet(fromCharacter: 58, toCharacter: 64)) +// set.formUnion(FullTextSearchFinder.characterSet(fromCharacter: 91, toCharacter: 96)) +// set.formUnion(FullTextSearchFinder.characterSet(fromCharacter: 123, toCharacter: 126)) +// return set +// }() +// +// public class func filterIndexOrQueryText(text: String) -> String { +// let filteredScalars = String(text.unicodeScalars.lazy.map { +// if kFilterCharacters.contains($0) { +// return " " +// } else { +// return Character($0) +// } +// }) +// +// // Remove any phone number formatting from the search terms +// let nonformattingScalars = filteredScalars.unicodeScalars.lazy.filter { +// !CharacterSet.punctuationCharacters.contains($0) +// } +// +// var normalized = String(String.UnicodeScalarView(nonformattingScalars)) +// +// // Simplify the normalized text by combining adjacent whitespace. +// while normalized.contains(" ") { +// normalized = normalized.replacingOccurrences(of: " ", with: " ") +// } +// +// // We strip leading & trailing whitespace last, since we may replace +// // filtered characters with whitespace. +// return normalized.trimmingCharacters(in: .whitespacesAndNewlines) +// } + private func normalize(queryText: String) -> String { var normalized: String = queryText.trimmingCharacters(in: .whitespacesAndNewlines) From f5a5d84edcb95f226f6511dcc3c73353a4beb329 Mon Sep 17 00:00:00 2001 From: Matthew Chen Date: Thu, 14 Jun 2018 10:50:23 -0400 Subject: [PATCH 2/8] Filter search index text. --- .../src/Storage/FullTextSearchFinder.swift | 150 +++++++++--------- SignalServiceKit/src/Storage/OWSStorage.m | 11 +- 2 files changed, 80 insertions(+), 81 deletions(-) diff --git a/SignalServiceKit/src/Storage/FullTextSearchFinder.swift b/SignalServiceKit/src/Storage/FullTextSearchFinder.swift index b830d32c8..467389ee6 100644 --- a/SignalServiceKit/src/Storage/FullTextSearchFinder.swift +++ b/SignalServiceKit/src/Storage/FullTextSearchFinder.swift @@ -18,17 +18,7 @@ public class SearchIndexer { } private func normalize(indexingText: String) -> String { - var normalized: String = indexingText.trimmingCharacters(in: .whitespacesAndNewlines) - - // Remove any punctuation from the search index - let nonformattingScalars = normalized.unicodeScalars.lazy.filter { - !CharacterSet.punctuationCharacters.contains($0) - } - - normalized = String(String.UnicodeScalarView(nonformattingScalars)) - - return normalized -// return FullTextSearchFinder.filterIndexOrQueryText(text: indexingText) + return FullTextSearchFinder.sanitize(text: indexingText) } } @@ -37,24 +27,42 @@ public class FullTextSearchFinder: NSObject { // Mark: Querying + // We want to match by prefix for "search as you type" functionality. + // SQLite does not support suffix or contains matches. + public class func query(searchText: String) -> String { + // 1. Normalize the search text. + let normalizedSearchText = normalize(queryText: searchText) + + // 2. Split into tokens. + let queryTerms = normalizedSearchText.split(separator: " ").filter { + // Ignore empty tokens. + $0.count > 0 + }.map { + // Allow partial match of each token. + $0 + "*" + } + + // 3. Join tokens into query string. + let query = queryTerms.joined(separator: " ") + return query + } + public func enumerateObjects(searchText: String, transaction: YapDatabaseReadTransaction, block: @escaping (Any, String) -> Void) { guard let ext: YapDatabaseFullTextSearchTransaction = ext(transaction: transaction) else { owsFail("\(logTag) ext was unexpectedly nil") return } - let normalized = normalize(queryText: searchText) + let query = FullTextSearchFinder.query(searchText: searchText) - // We want to match by prefix for "search as you type" functionality. - // SQLite does not support suffix or contains matches. - let prefixQuery = "\(normalized)*" + Logger.verbose("\(logTag) query: \(query)") let maxSearchResults = 500 var searchResultCount = 0 let snippetOptions = YapDatabaseFullTextSearchSnippetOptions() snippetOptions.startMatchText = "" snippetOptions.endMatchText = "" - ext.enumerateKeysAndObjects(matching: prefixQuery, with: snippetOptions) { (snippet: String, _: String, _: String, object: Any, stop: UnsafeMutablePointer) in + ext.enumerateKeysAndObjects(matching: query, with: snippetOptions) { (snippet: String, _: String, _: String, object: Any, stop: UnsafeMutablePointer) in guard searchResultCount < maxSearchResults else { stop.pointee = true return @@ -66,64 +74,49 @@ public class FullTextSearchFinder: NSObject { } // Mark: Filtering - -// private class func characterSet(fromCharacter: UInt32, toCharacter: UInt32) -> CharacterSet { -// var string = "" -// // Add to include last character. -// for character in fromCharacter ..< toCharacter + 1 { -// guard let chr = Unicode.Scalar(character) else { -// assertionFailure("\(self.logTag) could not parse character.") -// continue -// } -// string += String(chr) -// } -// return CharacterSet(charactersIn: string) -// } -// -// private static var kFilterCharacters: CharacterSet = { -// var set = CharacterSet() -// set.formUnion(FullTextSearchFinder.characterSet(fromCharacter: 0, toCharacter: 31)) -// set.formUnion(FullTextSearchFinder.characterSet(fromCharacter: 33, toCharacter: 47)) -// set.formUnion(FullTextSearchFinder.characterSet(fromCharacter: 58, toCharacter: 64)) -// set.formUnion(FullTextSearchFinder.characterSet(fromCharacter: 91, toCharacter: 96)) -// set.formUnion(FullTextSearchFinder.characterSet(fromCharacter: 123, toCharacter: 126)) -// return set -// }() -// -// public class func filterIndexOrQueryText(text: String) -> String { -// let filteredScalars = String(text.unicodeScalars.lazy.map { -// if kFilterCharacters.contains($0) { -// return " " -// } else { -// return Character($0) -// } -// }) -// -// // Remove any phone number formatting from the search terms -// let nonformattingScalars = filteredScalars.unicodeScalars.lazy.filter { -// !CharacterSet.punctuationCharacters.contains($0) -// } -// -// var normalized = String(String.UnicodeScalarView(nonformattingScalars)) -// -// // Simplify the normalized text by combining adjacent whitespace. -// while normalized.contains(" ") { -// normalized = normalized.replacingOccurrences(of: " ", with: " ") -// } -// -// // We strip leading & trailing whitespace last, since we may replace -// // filtered characters with whitespace. -// return normalized.trimmingCharacters(in: .whitespacesAndNewlines) -// } - - private func normalize(queryText: String) -> String { - var normalized: String = queryText.trimmingCharacters(in: .whitespacesAndNewlines) - - // Remove any punctuation from the search terms - let nonformattingScalars = normalized.unicodeScalars.lazy.filter { - !CharacterSet.punctuationCharacters.contains($0) + + fileprivate class func charactersToRemove() -> CharacterSet { + var charactersToFilter = CharacterSet.punctuationCharacters + charactersToFilter.formUnion(CharacterSet.illegalCharacters) + charactersToFilter.formUnion(CharacterSet.controlCharacters) + charactersToFilter.formUnion(CharacterSet.symbols) + return charactersToFilter + } + + fileprivate class func separatorCharacters() -> CharacterSet { + let separatorCharacters = CharacterSet.whitespacesAndNewlines + return separatorCharacters + } + + fileprivate class func sanitize(text: String) -> String { + // 1. Filter out invalid characters. + let filtered = text.unicodeScalars.lazy.filter({ + !charactersToRemove().contains($0) + }) + + // 2. Simplify whitespace. + let simplifyingFunction: (UnicodeScalar) -> UnicodeScalar = { + if separatorCharacters().contains($0) { + return UnicodeScalar(" ") + } else { + return $0 + } } - let normalizedChars = String(String.UnicodeScalarView(nonformattingScalars)) + let simplified = filtered.map(simplifyingFunction) + + // 3. Combine adjacent whitespace. + var result = String(String.UnicodeScalarView(simplified)) + while result.contains(" ") { + result = result.replacingOccurrences(of: " ", with: " ") + } + + // 4. Strip leading & trailing whitespace last, since we may replace + // filtered characters with whitespace. + return result.trimmingCharacters(in: .whitespacesAndNewlines) + } + + private class func normalize(queryText: String) -> String { + var normalized: String = FullTextSearchFinder.sanitize(text: queryText) let digitsOnlyScalars = normalized.unicodeScalars.lazy.filter { CharacterSet.decimalDigits.contains($0) @@ -131,9 +124,9 @@ public class FullTextSearchFinder: NSObject { let normalizedDigits = String(String.UnicodeScalarView(digitsOnlyScalars)) if normalizedDigits.count > 0 { - return "\(normalizedChars) OR \(normalizedDigits)" + return "\(normalized) OR \(normalizedDigits)" } else { - return "\(normalizedChars)" + return normalized } } @@ -231,8 +224,11 @@ public class FullTextSearchFinder: NSObject { } // update search index on contact name changes? - // update search index on message insertion? - return YapDatabaseFullTextSearch(columnNames: ["content"], handler: handler) + return YapDatabaseFullTextSearch(columnNames: ["content"], + options: nil, + handler: handler, + ftsVersion: YapDatabaseFullTextSearchFTS5Version, + versionTag: "1") } } diff --git a/SignalServiceKit/src/Storage/OWSStorage.m b/SignalServiceKit/src/Storage/OWSStorage.m index 8a61f5b2c..9f0d1ee11 100644 --- a/SignalServiceKit/src/Storage/OWSStorage.m +++ b/SignalServiceKit/src/Storage/OWSStorage.m @@ -542,10 +542,13 @@ NSString *const kNSUserDefaults_DatabaseExtensionVersionMap = @"kNSUserDefaults_ YapDatabaseFullTextSearch *fullTextSearch = (YapDatabaseFullTextSearch *)extension; NSString *versionTag = [self appendSuffixToDatabaseExtensionVersionIfNecessary:fullTextSearch.versionTag extensionName:extensionName]; - YapDatabaseFullTextSearch *fullTextSearchCopy = [[YapDatabaseFullTextSearch alloc] initWithColumnNames:fullTextSearch->columnNames.array - handler:fullTextSearch->handler - versionTag:versionTag]; - + YapDatabaseFullTextSearch *fullTextSearchCopy = + [[YapDatabaseFullTextSearch alloc] initWithColumnNames:fullTextSearch->columnNames.array + options:fullTextSearch->options + handler:fullTextSearch->handler + ftsVersion:fullTextSearch->ftsVersion + versionTag:versionTag]; + return fullTextSearchCopy; } else if ([extension isKindOfClass:[YapDatabaseCrossProcessNotification class]]) { // versionTag doesn't matter for YapDatabaseCrossProcessNotification. From a51e9b78b0e4c26c60d14e80714103bc97666445 Mon Sep 17 00:00:00 2001 From: Matthew Chen Date: Thu, 14 Jun 2018 11:46:32 -0400 Subject: [PATCH 3/8] Improve search query construction. --- Signal/test/util/SearcherTest.swift | 26 ++++++++-- SignalMessaging/contacts/OWSContactsManager.h | 1 - .../utils/ConversationSearcher.swift | 8 +-- .../src/Protocols/ContactsManagerProtocol.h | 3 ++ .../src/Storage/FullTextSearchFinder.swift | 50 +++++++++---------- 5 files changed, 53 insertions(+), 35 deletions(-) diff --git a/Signal/test/util/SearcherTest.swift b/Signal/test/util/SearcherTest.swift index 2472a754e..a8b98ca49 100644 --- a/Signal/test/util/SearcherTest.swift +++ b/Signal/test/util/SearcherTest.swift @@ -64,6 +64,10 @@ class FakeContactsManager: NSObject, ContactsManagerProtocol { func isSystemContact(withSignalAccount recipientId: String) -> Bool { return true } + + func compare(signalAccount left: SignalAccount, with right: SignalAccount) -> ComparisonResult { + return .orderedAscending + } } let bobRecipientId = "+49030183000" @@ -265,7 +269,7 @@ class ConversationSearcherTest: XCTestCase { private func getResultSet(searchText: String) -> SearchResultSet { var results: SearchResultSet! self.dbConnection.read { transaction in - results = self.searcher.results(searchText: searchText, transaction: transaction) + results = self.searcher.results(searchText: searchText, transaction: transaction, contactsManager: TextSecureKitEnv.shared().contactsManager) } return results } @@ -342,9 +346,23 @@ class SearcherTest: XCTestCase { XCTAssertFalse(searcher.matches(item: regularLizaveta, query: "Liza 323")) } - func testTextSanitization() { + func testSearchQuery() { + XCTAssertEqual(FullTextSearchFinder.query(searchText: "Liza"), "Liza*") + XCTAssertEqual(FullTextSearchFinder.query(searchText: "Liza +1-323"), "1323* Liza*") + XCTAssertEqual(FullTextSearchFinder.query(searchText: "\"\\ `~!@#$%^&*()_+-={}|[]:;'<>?,./Liza +1-323"), "1323* Liza*") + XCTAssertEqual(FullTextSearchFinder.query(searchText: "renaldo RENALDO reñaldo REÑALDO"), "RENALDO* REÑALDO* renaldo* reñaldo*") + XCTAssertEqual(FullTextSearchFinder.query(searchText: "😏"), "😏*") + XCTAssertEqual(FullTextSearchFinder.query(searchText: "alice 123 bob 456"), "123* 123456* 456* alice* bob*") + XCTAssertEqual(FullTextSearchFinder.query(searchText: "Li!za"), "Liza*") + XCTAssertEqual(FullTextSearchFinder.query(searchText: "Liza Liza"), "Liza*") + XCTAssertEqual(FullTextSearchFinder.query(searchText: "Liza liza"), "Liza* liza*") + } + + func testTextNormalization() { XCTAssertEqual(FullTextSearchFinder.normalize(text: "Liza"), "Liza") - XCTAssertEqual(FullTextSearchFinder.normalize(text: "Liza +1-323"), "Liza 1 323") - XCTAssertEqual(FullTextSearchFinder.normalize(text: "\"\\'!&@#$%^&*()Liza +1-323"), "Liza 1 323") + XCTAssertEqual(FullTextSearchFinder.normalize(text: "Liza +1-323"), "Liza 1323") + XCTAssertEqual(FullTextSearchFinder.normalize(text: "\"\\ `~!@#$%^&*()_+-={}|[]:;'<>?,./Liza +1-323"), "Liza 1323") + XCTAssertEqual(FullTextSearchFinder.normalize(text: "renaldo RENALDO reñaldo REÑALDO"), "renaldo RENALDO reñaldo REÑALDO") + XCTAssertEqual(FullTextSearchFinder.normalize(text: "😏"), "😏") } } diff --git a/SignalMessaging/contacts/OWSContactsManager.h b/SignalMessaging/contacts/OWSContactsManager.h index 7dc66d88c..3a34898ba 100644 --- a/SignalMessaging/contacts/OWSContactsManager.h +++ b/SignalMessaging/contacts/OWSContactsManager.h @@ -71,7 +71,6 @@ extern NSString *const OWSContactsManagerSignalAccountsDidChangeNotification; * Used for sorting, respects system contacts name sort order preference. */ - (NSString *)comparableNameForSignalAccount:(SignalAccount *)signalAccount; -- (NSComparisonResult)compareSignalAccount:(SignalAccount *)left withSignalAccount:(SignalAccount *)right; // Generally we prefer the formattedProfileName over the raw profileName so as to // distinguish a profile name apart from a name pulled from the system's contacts. diff --git a/SignalMessaging/utils/ConversationSearcher.swift b/SignalMessaging/utils/ConversationSearcher.swift index ff72a4d55..70cb56c63 100644 --- a/SignalMessaging/utils/ConversationSearcher.swift +++ b/SignalMessaging/utils/ConversationSearcher.swift @@ -39,13 +39,13 @@ public class ConversationSearchResult: Comparable { public class ContactSearchResult: Comparable { public let signalAccount: SignalAccount - public let contactsManager: OWSContactsManager + public let contactsManager: ContactsManagerProtocol public var recipientId: String { return signalAccount.recipientId } - init(signalAccount: SignalAccount, contactsManager: OWSContactsManager) { + init(signalAccount: SignalAccount, contactsManager: ContactsManagerProtocol) { self.signalAccount = signalAccount self.contactsManager = contactsManager } @@ -53,7 +53,7 @@ public class ContactSearchResult: Comparable { // Mark: Comparable public static func < (lhs: ContactSearchResult, rhs: ContactSearchResult) -> Bool { - return lhs.contactsManager.compareSignalAccount(lhs.signalAccount, with: rhs.signalAccount) == .orderedAscending + return lhs.contactsManager.compare(signalAccount: lhs.signalAccount, with: rhs.signalAccount) == .orderedAscending } // MARK: Equatable @@ -99,7 +99,7 @@ public class ConversationSearcher: NSObject { public func results(searchText: String, transaction: YapDatabaseReadTransaction, - contactsManager: OWSContactsManager) -> SearchResultSet { + contactsManager: ContactsManagerProtocol) -> SearchResultSet { var conversations: [ConversationSearchResult] = [] var contacts: [ContactSearchResult] = [] diff --git a/SignalServiceKit/src/Protocols/ContactsManagerProtocol.h b/SignalServiceKit/src/Protocols/ContactsManagerProtocol.h index 64d7e0335..7a6e5cc69 100644 --- a/SignalServiceKit/src/Protocols/ContactsManagerProtocol.h +++ b/SignalServiceKit/src/Protocols/ContactsManagerProtocol.h @@ -17,6 +17,9 @@ NS_ASSUME_NONNULL_BEGIN - (BOOL)isSystemContact:(NSString *)recipientId; - (BOOL)isSystemContactWithSignalAccount:(NSString *)recipientId; +- (NSComparisonResult)compareSignalAccount:(SignalAccount *)left + withSignalAccount:(SignalAccount *)right NS_SWIFT_NAME(compare(signalAccount:with:)); + @end NS_ASSUME_NONNULL_END diff --git a/SignalServiceKit/src/Storage/FullTextSearchFinder.swift b/SignalServiceKit/src/Storage/FullTextSearchFinder.swift index 467389ee6..afe507080 100644 --- a/SignalServiceKit/src/Storage/FullTextSearchFinder.swift +++ b/SignalServiceKit/src/Storage/FullTextSearchFinder.swift @@ -18,7 +18,7 @@ public class SearchIndexer { } private func normalize(indexingText: String) -> String { - return FullTextSearchFinder.sanitize(text: indexingText) + return FullTextSearchFinder.normalize(text: indexingText) } } @@ -31,19 +31,32 @@ public class FullTextSearchFinder: NSObject { // SQLite does not support suffix or contains matches. public class func query(searchText: String) -> String { // 1. Normalize the search text. - let normalizedSearchText = normalize(queryText: searchText) + let normalizedSearchText = FullTextSearchFinder.normalize(text: searchText) - // 2. Split into tokens. - let queryTerms = normalizedSearchText.split(separator: " ").filter { - // Ignore empty tokens. + // 2. Split into query terms (or tokens). + var queryTerms = normalizedSearchText.split(separator: " ") + + // 3. Add an additional numeric-only query term. + let digitsOnlyScalars = normalizedSearchText.unicodeScalars.lazy.filter { + CharacterSet.decimalDigits.contains($0) + } + let digitsOnly: Substring = Substring(String(String.UnicodeScalarView(digitsOnlyScalars))) + queryTerms.append(digitsOnly) + + // 4. De-duplicate and sort query terms. + queryTerms = Array(Set(queryTerms)).sorted() + + // 5. Filter the query terms. + let filteredQueryTerms = queryTerms.filter { + // Ignore empty terms. $0.count > 0 }.map { - // Allow partial match of each token. + // Allow partial match of each term. $0 + "*" } - // 3. Join tokens into query string. - let query = queryTerms.joined(separator: " ") + // 6. Join terms into query string. + let query = filteredQueryTerms.joined(separator: " ") return query } @@ -73,13 +86,13 @@ public class FullTextSearchFinder: NSObject { } } - // Mark: Filtering + // Mark: Normalization fileprivate class func charactersToRemove() -> CharacterSet { var charactersToFilter = CharacterSet.punctuationCharacters charactersToFilter.formUnion(CharacterSet.illegalCharacters) charactersToFilter.formUnion(CharacterSet.controlCharacters) - charactersToFilter.formUnion(CharacterSet.symbols) + charactersToFilter.formUnion(CharacterSet(charactersIn: "+~$^=|<>`")) return charactersToFilter } @@ -88,7 +101,7 @@ public class FullTextSearchFinder: NSObject { return separatorCharacters } - fileprivate class func sanitize(text: String) -> String { + public class func normalize(text: String) -> String { // 1. Filter out invalid characters. let filtered = text.unicodeScalars.lazy.filter({ !charactersToRemove().contains($0) @@ -115,21 +128,6 @@ public class FullTextSearchFinder: NSObject { return result.trimmingCharacters(in: .whitespacesAndNewlines) } - private class func normalize(queryText: String) -> String { - var normalized: String = FullTextSearchFinder.sanitize(text: queryText) - - let digitsOnlyScalars = normalized.unicodeScalars.lazy.filter { - CharacterSet.decimalDigits.contains($0) - } - let normalizedDigits = String(String.UnicodeScalarView(digitsOnlyScalars)) - - if normalizedDigits.count > 0 { - return "\(normalized) OR \(normalizedDigits)" - } else { - return normalized - } - } - // Mark: Index Building private class var contactsManager: ContactsManagerProtocol { From 153f3fc0a59b9f0e46f95fc82c8d89ac06603732 Mon Sep 17 00:00:00 2001 From: Matthew Chen Date: Thu, 14 Jun 2018 12:04:23 -0400 Subject: [PATCH 4/8] Improve search query construction. --- SignalServiceKit/src/Storage/FullTextSearchFinder.swift | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/SignalServiceKit/src/Storage/FullTextSearchFinder.swift b/SignalServiceKit/src/Storage/FullTextSearchFinder.swift index afe507080..bb35d8f52 100644 --- a/SignalServiceKit/src/Storage/FullTextSearchFinder.swift +++ b/SignalServiceKit/src/Storage/FullTextSearchFinder.swift @@ -92,7 +92,8 @@ public class FullTextSearchFinder: NSObject { var charactersToFilter = CharacterSet.punctuationCharacters charactersToFilter.formUnion(CharacterSet.illegalCharacters) charactersToFilter.formUnion(CharacterSet.controlCharacters) - charactersToFilter.formUnion(CharacterSet(charactersIn: "+~$^=|<>`")) + // Note that we strip the Unicode "subtitute" character (26). + charactersToFilter.formUnion(CharacterSet(charactersIn: "+~$^=|<>`_\u{26}")) return charactersToFilter } From 755d30254efe8272899e92d14ae6c2748514de62 Mon Sep 17 00:00:00 2001 From: Matthew Chen Date: Thu, 14 Jun 2018 13:08:44 -0400 Subject: [PATCH 5/8] Improve search query construction. --- .../src/Storage/FullTextSearchFinder.swift | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/SignalServiceKit/src/Storage/FullTextSearchFinder.swift b/SignalServiceKit/src/Storage/FullTextSearchFinder.swift index bb35d8f52..67e311816 100644 --- a/SignalServiceKit/src/Storage/FullTextSearchFinder.swift +++ b/SignalServiceKit/src/Storage/FullTextSearchFinder.swift @@ -52,7 +52,9 @@ public class FullTextSearchFinder: NSObject { $0.count > 0 }.map { // Allow partial match of each term. - $0 + "*" + // + // Note that we use double-quotes to enclose each search term. + "\"\($0)\"*" } // 6. Join terms into query string. @@ -92,8 +94,17 @@ public class FullTextSearchFinder: NSObject { var charactersToFilter = CharacterSet.punctuationCharacters charactersToFilter.formUnion(CharacterSet.illegalCharacters) charactersToFilter.formUnion(CharacterSet.controlCharacters) - // Note that we strip the Unicode "subtitute" character (26). - charactersToFilter.formUnion(CharacterSet(charactersIn: "+~$^=|<>`_\u{26}")) + + // We want to strip all ASCII characters except: + // * Letters a-z, A-Z + // * Numerals 0-9 + // * Whitespace + var asciiToFilter = CharacterSet(charactersIn: UnicodeScalar(0x0)!.. YapDatabaseFullTextSearchTransaction? { return transaction.ext(FullTextSearchFinder.dbExtensionName) as? YapDatabaseFullTextSearchTransaction From 5c42e4c59e08d551af95c8484dcd6335d6b1df98 Mon Sep 17 00:00:00 2001 From: Matthew Chen Date: Thu, 14 Jun 2018 13:49:45 -0400 Subject: [PATCH 6/8] Improve search query construction. --- Signal/test/util/SearcherTest.swift | 20 +++++++------ .../src/Storage/FullTextSearchFinder.swift | 30 +++++++++++-------- 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/Signal/test/util/SearcherTest.swift b/Signal/test/util/SearcherTest.swift index a8b98ca49..70006ee41 100644 --- a/Signal/test/util/SearcherTest.swift +++ b/Signal/test/util/SearcherTest.swift @@ -66,6 +66,8 @@ class FakeContactsManager: NSObject, ContactsManagerProtocol { } func compare(signalAccount left: SignalAccount, with right: SignalAccount) -> ComparisonResult { + owsFail("\(logTag) if this method ends up being used by the tests, we should provide a better implementation.") + return .orderedAscending } } @@ -347,15 +349,15 @@ class SearcherTest: XCTestCase { } func testSearchQuery() { - XCTAssertEqual(FullTextSearchFinder.query(searchText: "Liza"), "Liza*") - XCTAssertEqual(FullTextSearchFinder.query(searchText: "Liza +1-323"), "1323* Liza*") - XCTAssertEqual(FullTextSearchFinder.query(searchText: "\"\\ `~!@#$%^&*()_+-={}|[]:;'<>?,./Liza +1-323"), "1323* Liza*") - XCTAssertEqual(FullTextSearchFinder.query(searchText: "renaldo RENALDO reñaldo REÑALDO"), "RENALDO* REÑALDO* renaldo* reñaldo*") - XCTAssertEqual(FullTextSearchFinder.query(searchText: "😏"), "😏*") - XCTAssertEqual(FullTextSearchFinder.query(searchText: "alice 123 bob 456"), "123* 123456* 456* alice* bob*") - XCTAssertEqual(FullTextSearchFinder.query(searchText: "Li!za"), "Liza*") - XCTAssertEqual(FullTextSearchFinder.query(searchText: "Liza Liza"), "Liza*") - XCTAssertEqual(FullTextSearchFinder.query(searchText: "Liza liza"), "Liza* liza*") + XCTAssertEqual(FullTextSearchFinder.query(searchText: "Liza"), "\"Liza\"*") + XCTAssertEqual(FullTextSearchFinder.query(searchText: "Liza +1-323"), "\"1323\"* \"Liza\"*") + XCTAssertEqual(FullTextSearchFinder.query(searchText: "\"\\ `~!@#$%^&*()_+-={}|[]:;'<>?,./Liza +1-323"), "\"1323\"* \"Liza\"*") + XCTAssertEqual(FullTextSearchFinder.query(searchText: "renaldo RENALDO reñaldo REÑALDO"), "\"RENALDO\"* \"REÑALDO\"* \"renaldo\"* \"reñaldo\"*") + XCTAssertEqual(FullTextSearchFinder.query(searchText: "😏"), "\"😏\"*") + XCTAssertEqual(FullTextSearchFinder.query(searchText: "alice 123 bob 456"), "\"123\"* \"123456\"* \"456\"* \"alice\"* \"bob\"*") + XCTAssertEqual(FullTextSearchFinder.query(searchText: "Li!za"), "\"Liza\"*") + XCTAssertEqual(FullTextSearchFinder.query(searchText: "Liza Liza"), "\"Liza\"*") + XCTAssertEqual(FullTextSearchFinder.query(searchText: "Liza liza"), "\"Liza\"* \"liza\"*") } func testTextNormalization() { diff --git a/SignalServiceKit/src/Storage/FullTextSearchFinder.swift b/SignalServiceKit/src/Storage/FullTextSearchFinder.swift index 67e311816..bf1b8c7a6 100644 --- a/SignalServiceKit/src/Storage/FullTextSearchFinder.swift +++ b/SignalServiceKit/src/Storage/FullTextSearchFinder.swift @@ -31,6 +31,9 @@ public class FullTextSearchFinder: NSObject { // SQLite does not support suffix or contains matches. public class func query(searchText: String) -> String { // 1. Normalize the search text. + // + // TODO: We could arguably convert to lowercase since the search + // is case-insensitive. let normalizedSearchText = FullTextSearchFinder.normalize(text: searchText) // 2. Split into query terms (or tokens). @@ -44,6 +47,9 @@ public class FullTextSearchFinder: NSObject { queryTerms.append(digitsOnly) // 4. De-duplicate and sort query terms. + // Duplicate terms are redundant. + // Sorting terms makes the output of this method deterministic and easier to test, + // and the order won't affect the search results. queryTerms = Array(Set(queryTerms)).sorted() // 5. Filter the query terms. @@ -54,6 +60,10 @@ public class FullTextSearchFinder: NSObject { // Allow partial match of each term. // // Note that we use double-quotes to enclose each search term. + // Quoted search terms can include a few more characters than + // "bareword" (non-quoted) search terms. This shouldn't matter, + // since we're filtering all of the affected characters, but + // quoting protects us from any bugs in that logic. "\"\($0)\"*" } @@ -91,6 +101,10 @@ public class FullTextSearchFinder: NSObject { // Mark: Normalization fileprivate class func charactersToRemove() -> CharacterSet { + // * We want to strip punctuation - and our definition of "punctuation" + // is broader than `CharacterSet.punctuationCharacters`. + // * FTS should be robust to (i.e. ignore) illegal and control characters, + // but it's safer if we filter them ourselves as well. var charactersToFilter = CharacterSet.punctuationCharacters charactersToFilter.formUnion(CharacterSet.illegalCharacters) charactersToFilter.formUnion(CharacterSet.controlCharacters) @@ -108,11 +122,6 @@ public class FullTextSearchFinder: NSObject { return charactersToFilter } - fileprivate class func separatorCharacters() -> CharacterSet { - let separatorCharacters = CharacterSet.whitespacesAndNewlines - return separatorCharacters - } - public class func normalize(text: String) -> String { // 1. Filter out invalid characters. let filtered = text.unicodeScalars.lazy.filter({ @@ -121,7 +130,7 @@ public class FullTextSearchFinder: NSObject { // 2. Simplify whitespace. let simplifyingFunction: (UnicodeScalar) -> UnicodeScalar = { - if separatorCharacters().contains($0) { + if CharacterSet.whitespacesAndNewlines.contains($0) { return UnicodeScalar(" ") } else { return $0 @@ -129,14 +138,9 @@ public class FullTextSearchFinder: NSObject { } let simplified = filtered.map(simplifyingFunction) - // 3. Combine adjacent whitespace. - var result = String(String.UnicodeScalarView(simplified)) - while result.contains(" ") { - result = result.replacingOccurrences(of: " ", with: " ") - } - - // 4. Strip leading & trailing whitespace last, since we may replace + // 3. Strip leading & trailing whitespace last, since we may replace // filtered characters with whitespace. + var result = String(String.UnicodeScalarView(simplified)) return result.trimmingCharacters(in: .whitespacesAndNewlines) } From 527e2715d4cbb265ed60cebe3eb9dc89212ee659 Mon Sep 17 00:00:00 2001 From: Matthew Chen Date: Thu, 14 Jun 2018 15:24:04 -0400 Subject: [PATCH 7/8] Elaborate the search tests. --- Signal/test/util/SearcherTest.swift | 112 ++++++++++++++++-- .../src/Storage/FullTextSearchFinder.swift | 13 +- 2 files changed, 111 insertions(+), 14 deletions(-) diff --git a/Signal/test/util/SearcherTest.swift b/Signal/test/util/SearcherTest.swift index 70006ee41..d8e1aa5c0 100644 --- a/Signal/test/util/SearcherTest.swift +++ b/Signal/test/util/SearcherTest.swift @@ -66,7 +66,7 @@ class FakeContactsManager: NSObject, ContactsManagerProtocol { } func compare(signalAccount left: SignalAccount, with right: SignalAccount) -> ComparisonResult { - owsFail("\(logTag) if this method ends up being used by the tests, we should provide a better implementation.") + owsFail("if this method ends up being used by the tests, we should provide a better implementation.") return .orderedAscending } @@ -137,6 +137,12 @@ class ConversationSearcherTest: XCTestCase { let goodbyeBookClub = TSOutgoingMessage(in: bookClubGroupThread, messageBody: "Goodbye Book Club", attachmentId: nil) goodbyeBookClub.save(with: transaction) + + let bobsPhoneNumber = TSOutgoingMessage(in: bookClubGroupThread, messageBody: "My phone number is: 321-321-4321", attachmentId: nil) + bobsPhoneNumber.save(with: transaction) + + let bobsFaxNumber = TSOutgoingMessage(in: bookClubGroupThread, messageBody: "My fax is: 222-333-4444", attachmentId: nil) + bobsFaxNumber.save(with: transaction) } } @@ -187,17 +193,17 @@ class ConversationSearcherTest: XCTestCase { // Exact match threads = searchConversations(searchText: aliceRecipientId) XCTAssertEqual(3, threads.count) - XCTAssertEqual([bookClubThread, snackClubThread, aliceThread], threads) + XCTAssertEqual([bookClubThread, aliceThread, snackClubThread], threads) // Partial match threads = searchConversations(searchText: "+123456") XCTAssertEqual(3, threads.count) - XCTAssertEqual([bookClubThread, snackClubThread, aliceThread], threads) + XCTAssertEqual([bookClubThread, aliceThread, snackClubThread], threads) // Prefixes threads = searchConversations(searchText: "12345678900") XCTAssertEqual(3, threads.count) - XCTAssertEqual([bookClubThread, snackClubThread, aliceThread], threads) + XCTAssertEqual([bookClubThread, aliceThread, snackClubThread], threads) threads = searchConversations(searchText: "49") XCTAssertEqual(1, threads.count) @@ -205,15 +211,19 @@ class ConversationSearcherTest: XCTestCase { threads = searchConversations(searchText: "1-234-56") XCTAssertEqual(3, threads.count) - XCTAssertEqual([bookClubThread, snackClubThread, aliceThread], threads) + XCTAssertEqual([bookClubThread, aliceThread, snackClubThread], threads) threads = searchConversations(searchText: "123456") XCTAssertEqual(3, threads.count) - XCTAssertEqual([bookClubThread, snackClubThread, aliceThread], threads) + XCTAssertEqual([bookClubThread, aliceThread, snackClubThread], threads) threads = searchConversations(searchText: "1.234.56") XCTAssertEqual(3, threads.count) - XCTAssertEqual([bookClubThread, snackClubThread, aliceThread], threads) + XCTAssertEqual([bookClubThread, aliceThread, snackClubThread], threads) + + threads = searchConversations(searchText: "1 234 56") + XCTAssertEqual(3, threads.count) + XCTAssertEqual([bookClubThread, aliceThread, snackClubThread], threads) } func testSearchContactByNumberWithoutCountryCode() { @@ -221,11 +231,11 @@ class ConversationSearcherTest: XCTestCase { // Phone Number formatting should be forgiving threads = searchConversations(searchText: "234.56") XCTAssertEqual(3, threads.count) - XCTAssertEqual([bookClubThread, snackClubThread, aliceThread], threads) + XCTAssertEqual([bookClubThread, aliceThread, snackClubThread], threads) threads = searchConversations(searchText: "234 56") XCTAssertEqual(3, threads.count) - XCTAssertEqual([bookClubThread, snackClubThread, aliceThread], threads) + XCTAssertEqual([bookClubThread, aliceThread, snackClubThread], threads) } func testSearchConversationByContactByName() { @@ -233,7 +243,7 @@ class ConversationSearcherTest: XCTestCase { threads = searchConversations(searchText: "Alice") XCTAssertEqual(3, threads.count) - XCTAssertEqual([bookClubThread, snackClubThread, aliceThread], threads) + XCTAssertEqual([bookClubThread, aliceThread, snackClubThread], threads) threads = searchConversations(searchText: "Bob") XCTAssertEqual(1, threads.count) @@ -261,6 +271,86 @@ class ConversationSearcherTest: XCTestCase { XCTAssert(resultSet.messages.map { $0.thread }.contains(bookClubThread)) } + func testSearchEdgeCases() { + var resultSet: SearchResultSet = .empty + + resultSet = getResultSet(searchText: "Hello Alice") + XCTAssertEqual(1, resultSet.messages.count) + XCTAssertEqual(["Hello Alice"], bodies(forMessageResults: resultSet.messages)) + + resultSet = getResultSet(searchText: "hello alice") + XCTAssertEqual(1, resultSet.messages.count) + XCTAssertEqual(["Hello Alice"], bodies(forMessageResults: resultSet.messages)) + + resultSet = getResultSet(searchText: "Hel") + XCTAssertEqual(2, resultSet.messages.count) + XCTAssertEqual(["Hello Alice", "Hello Book Club"], bodies(forMessageResults: resultSet.messages)) + + resultSet = getResultSet(searchText: "Hel Ali") + XCTAssertEqual(1, resultSet.messages.count) + XCTAssertEqual(["Hello Alice"], bodies(forMessageResults: resultSet.messages)) + + resultSet = getResultSet(searchText: "Hel Ali Alic") + XCTAssertEqual(1, resultSet.messages.count) + XCTAssertEqual(["Hello Alice"], bodies(forMessageResults: resultSet.messages)) + + resultSet = getResultSet(searchText: "Ali Hel") + XCTAssertEqual(1, resultSet.messages.count) + XCTAssertEqual(["Hello Alice"], bodies(forMessageResults: resultSet.messages)) + + resultSet = getResultSet(searchText: "CLU") + XCTAssertEqual(2, resultSet.messages.count) + XCTAssertEqual(["Goodbye Book Club", "Hello Book Club"], bodies(forMessageResults: resultSet.messages)) + + resultSet = getResultSet(searchText: "hello !@##!@#!$^@!@#! alice") + XCTAssertEqual(1, resultSet.messages.count) + XCTAssertEqual(["Hello Alice"], bodies(forMessageResults: resultSet.messages)) + + resultSet = getResultSet(searchText: "3213 phone") + XCTAssertEqual(1, resultSet.messages.count) + XCTAssertEqual(["My phone number is: 321-321-4321"], bodies(forMessageResults: resultSet.messages)) + + resultSet = getResultSet(searchText: "PHO 3213") + XCTAssertEqual(1, resultSet.messages.count) + XCTAssertEqual(["My phone number is: 321-321-4321"], bodies(forMessageResults: resultSet.messages)) + + resultSet = getResultSet(searchText: "fax") + XCTAssertEqual(1, resultSet.messages.count) + XCTAssertEqual(["My fax is: 222-333-4444"], bodies(forMessageResults: resultSet.messages)) + + resultSet = getResultSet(searchText: "fax 2223") + XCTAssertEqual(1, resultSet.messages.count) + XCTAssertEqual(["My fax is: 222-333-4444"], bodies(forMessageResults: resultSet.messages)) + } + + func bodies(forMessageResults messageResults: [ConversationSearchResult]) -> [String] { + var result = [String]() + + self.dbConnection.read { transaction in + for messageResult in messageResults { + guard let messageId = messageResult.messageId else { + owsFail("message result missing message id") + continue + } + guard let interaction = TSInteraction.fetch(uniqueId: messageId, transaction: transaction) else { + owsFail("couldn't load interaction for message result") + continue + } + guard let message = interaction as? TSMessage else { + owsFail("invalid message for message result") + continue + } + guard let messageBody = message.body else { + owsFail("message result missing message body") + continue + } + result.append(messageBody) + } + } + + return result.sorted() + } + // Mark: Helpers private func searchConversations(searchText: String) -> [ThreadViewModel] { @@ -354,7 +444,7 @@ class SearcherTest: XCTestCase { XCTAssertEqual(FullTextSearchFinder.query(searchText: "\"\\ `~!@#$%^&*()_+-={}|[]:;'<>?,./Liza +1-323"), "\"1323\"* \"Liza\"*") XCTAssertEqual(FullTextSearchFinder.query(searchText: "renaldo RENALDO reñaldo REÑALDO"), "\"RENALDO\"* \"REÑALDO\"* \"renaldo\"* \"reñaldo\"*") XCTAssertEqual(FullTextSearchFinder.query(searchText: "😏"), "\"😏\"*") - XCTAssertEqual(FullTextSearchFinder.query(searchText: "alice 123 bob 456"), "\"123\"* \"123456\"* \"456\"* \"alice\"* \"bob\"*") + XCTAssertEqual(FullTextSearchFinder.query(searchText: "alice 123 bob 456"), "\"123456\"* \"alice\"* \"bob\"*") XCTAssertEqual(FullTextSearchFinder.query(searchText: "Li!za"), "\"Liza\"*") XCTAssertEqual(FullTextSearchFinder.query(searchText: "Liza Liza"), "\"Liza\"*") XCTAssertEqual(FullTextSearchFinder.query(searchText: "Liza liza"), "\"Liza\"* \"liza\"*") diff --git a/SignalServiceKit/src/Storage/FullTextSearchFinder.swift b/SignalServiceKit/src/Storage/FullTextSearchFinder.swift index bf1b8c7a6..82b71edaa 100644 --- a/SignalServiceKit/src/Storage/FullTextSearchFinder.swift +++ b/SignalServiceKit/src/Storage/FullTextSearchFinder.swift @@ -36,8 +36,15 @@ public class FullTextSearchFinder: NSObject { // is case-insensitive. let normalizedSearchText = FullTextSearchFinder.normalize(text: searchText) - // 2. Split into query terms (or tokens). - var queryTerms = normalizedSearchText.split(separator: " ") + // 2. Split the non-numeric text into query terms (or tokens). + let nonNumericText = String(String.UnicodeScalarView(normalizedSearchText.unicodeScalars.lazy.map { + if CharacterSet.decimalDigits.contains($0) { + return " " + } else { + return $0 + } + })) + var queryTerms = nonNumericText.split(separator: " ") // 3. Add an additional numeric-only query term. let digitsOnlyScalars = normalizedSearchText.unicodeScalars.lazy.filter { @@ -140,7 +147,7 @@ public class FullTextSearchFinder: NSObject { // 3. Strip leading & trailing whitespace last, since we may replace // filtered characters with whitespace. - var result = String(String.UnicodeScalarView(simplified)) + let result = String(String.UnicodeScalarView(simplified)) return result.trimmingCharacters(in: .whitespacesAndNewlines) } From c8f2201a3740e9316690f9e80eabfde1d0c9f66f Mon Sep 17 00:00:00 2001 From: Matthew Chen Date: Thu, 14 Jun 2018 16:33:27 -0400 Subject: [PATCH 8/8] Respond to CR. --- .../src/Storage/FullTextSearchFinder.swift | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/SignalServiceKit/src/Storage/FullTextSearchFinder.swift b/SignalServiceKit/src/Storage/FullTextSearchFinder.swift index 82b71edaa..ab1042aad 100644 --- a/SignalServiceKit/src/Storage/FullTextSearchFinder.swift +++ b/SignalServiceKit/src/Storage/FullTextSearchFinder.swift @@ -25,7 +25,7 @@ public class SearchIndexer { @objc public class FullTextSearchFinder: NSObject { - // Mark: Querying + // MARK: - Querying // We want to match by prefix for "search as you type" functionality. // SQLite does not support suffix or contains matches. @@ -99,15 +99,15 @@ public class FullTextSearchFinder: NSObject { stop.pointee = true return } - searchResultCount = searchResultCount + 1 + searchResultCount += 1 block(object, snippet) } } - // Mark: Normalization + // MARK: - Normalization - fileprivate class func charactersToRemove() -> CharacterSet { + fileprivate static var charactersToRemove: CharacterSet = { // * We want to strip punctuation - and our definition of "punctuation" // is broader than `CharacterSet.punctuationCharacters`. // * FTS should be robust to (i.e. ignore) illegal and control characters, @@ -127,12 +127,12 @@ public class FullTextSearchFinder: NSObject { charactersToFilter.formUnion(asciiToFilter) return charactersToFilter - } + }() public class func normalize(text: String) -> String { // 1. Filter out invalid characters. let filtered = text.unicodeScalars.lazy.filter({ - !charactersToRemove().contains($0) + !charactersToRemove.contains($0) }) // 2. Simplify whitespace. @@ -151,7 +151,7 @@ public class FullTextSearchFinder: NSObject { return result.trimmingCharacters(in: .whitespacesAndNewlines) } - // Mark: Index Building + // MARK: - Index Building private class var contactsManager: ContactsManagerProtocol { return TextSecureKitEnv.shared().contactsManager