From 174ac6d7e3699adc2f008ceef35d820d02fb8aaf Mon Sep 17 00:00:00 2001
From: Matteo Pagliazzi <matteopagliazzi@gmail.com>
Date: Mon, 9 Nov 2020 11:34:28 +0100
Subject: [PATCH] =?UTF-8?q?Revert=20"Revert=20"fix(banned=20words):=20fix?=
 =?UTF-8?q?=20partial=20matching=20of=20words=20containing=20diacritic?=
 =?UTF-8?q?=E2=80=A6=20(#12444)""?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 5362058f3552d2c410638372fd48bb9a91861d2b.
---
 test/api/unit/libs/stringUtils.test.js |  5 +++++
 website/server/libs/stringUtils.js     | 11 ++++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/test/api/unit/libs/stringUtils.test.js b/test/api/unit/libs/stringUtils.test.js
index 0596cd16b9..947274df53 100644
--- a/test/api/unit/libs/stringUtils.test.js
+++ b/test/api/unit/libs/stringUtils.test.js
@@ -8,5 +8,10 @@ describe('stringUtils', () => {
       const matches = getMatchesByWordArray(message, bannedWords);
       expect(matches.length).to.equal(bannedWords.length);
     });
+    it('doesn\'t flag names with accented characters', () => {
+      const name = 'TESTPLACEHOLDERSWEARWORDHEREé';
+      const matches = getMatchesByWordArray(name, bannedWords);
+      expect(matches.length).to.equal(0);
+    });
   });
 });
diff --git a/website/server/libs/stringUtils.js b/website/server/libs/stringUtils.js
index a7e4b08aa8..39f75356c5 100644
--- a/website/server/libs/stringUtils.js
+++ b/website/server/libs/stringUtils.js
@@ -4,12 +4,21 @@ export function removePunctuationFromString (str) {
 
 // NOTE: the wordsToMatch aren't escaped in order to support regular expressions,
 // so this method should not be used if wordsToMatch contains unsanitized user input
+
 export function getMatchesByWordArray (str, wordsToMatch) {
+  // strip diacritics (e.g. é -> e) so the \b-based regEx below sees base letters:
+  // NFD-decompose the string, then drop the combining marks U+0300-U+036F
+  // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/normalize
+  // https://www.unicode.org/reports/tr15/#Canon_Compat_Equivalence
+  // https://unicode-table.com/en/#combining-diacritical-marks
+
+  const normalizedStr = str.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
+
   const matchedWords = [];
   const wordRegexs = wordsToMatch.map(word => new RegExp(`\\b([^a-z]+)?${word}([^a-z]+)?\\b`, 'i'));
   for (let i = 0; i < wordRegexs.length; i += 1) {
     const regEx = wordRegexs[i];
-    const match = str.match(regEx);
+    const match = normalizedStr.match(regEx);
     if (match !== null && match[0] !== null) {
       const trimmedMatch = removePunctuationFromString(match[0]).trim();
       matchedWords.push(trimmedMatch);