From 174ac6d7e3699adc2f008ceef35d820d02fb8aaf Mon Sep 17 00:00:00 2001 From: Matteo Pagliazzi Date: Mon, 9 Nov 2020 11:34:28 +0100 Subject: [PATCH] =?UTF-8?q?Revert=20"Revert=20"fix(banned=20words):=20fix?= =?UTF-8?q?=20partial=20matching=20of=20words=20containing=20diacritic?= =?UTF-8?q?=E2=80=A6=20(#12444)""?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 5362058f3552d2c410638372fd48bb9a91861d2b. --- test/api/unit/libs/stringUtils.test.js | 5 +++++ website/server/libs/stringUtils.js | 11 ++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/test/api/unit/libs/stringUtils.test.js b/test/api/unit/libs/stringUtils.test.js index 0596cd16b9..947274df53 100644 --- a/test/api/unit/libs/stringUtils.test.js +++ b/test/api/unit/libs/stringUtils.test.js @@ -8,5 +8,10 @@ describe('stringUtils', () => { const matches = getMatchesByWordArray(message, bannedWords); expect(matches.length).to.equal(bannedWords.length); }); + it('doesn\'t flag names with accented characters', () => { + const name = 'TESTPLACEHOLDERSWEARWORDHEREé'; + const matches = getMatchesByWordArray(name, bannedWords); + expect(matches.length).to.equal(0); + }); }); }); diff --git a/website/server/libs/stringUtils.js b/website/server/libs/stringUtils.js index a7e4b08aa8..39f75356c5 100644 --- a/website/server/libs/stringUtils.js +++ b/website/server/libs/stringUtils.js @@ -4,12 +4,21 @@ export function removePunctuationFromString (str) { // NOTE: the wordsToMatch aren't escaped in order to support regular expressions, // so this method should not be used if wordsToMatch contains unsanitized user input + export function getMatchesByWordArray (str, wordsToMatch) { + // remove accented characters from the string, which would trip up the regEx + // later on, by using the built-in Unicode normalisation methods + // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/normalize + // https://www.unicode.org/reports/tr15/#Canon_Compat_Equivalence + // https://unicode-table.com/en/#combining-diacritical-marks + + const normalizedStr = str.normalize('NFD').replace(/[\u0300-\u036f]/g, ''); + const matchedWords = []; const wordRegexs = wordsToMatch.map(word => new RegExp(`\\b([^a-z]+)?${word}([^a-z]+)?\\b`, 'i')); for (let i = 0; i < wordRegexs.length; i += 1) { const regEx = wordRegexs[i]; - const match = str.match(regEx); + const match = normalizedStr.match(regEx); if (match !== null && match[0] !== null) { const trimmedMatch = removePunctuationFromString(match[0]).trim(); matchedWords.push(trimmedMatch);