From 4b80a5ad3011cdb5350606d75fc8c830fe386f8c Mon Sep 17 00:00:00 2001
From: Navy <navydotgif@gmail.com>
Date: Wed, 30 Sep 2020 19:32:48 +0300
Subject: [PATCH] experimental word filter implementation

---
 .../components/observers/Automoderation.js    | 127 +++++++++++++++++-
 1 file changed, 121 insertions(+), 6 deletions(-)

diff --git a/structure/client/components/observers/Automoderation.js b/structure/client/components/observers/Automoderation.js
index 574283c..50bc489 100644
--- a/structure/client/components/observers/Automoderation.js
+++ b/structure/client/components/observers/Automoderation.js
@@ -1,13 +1,16 @@
-const { Observer } = require('../../../interfaces/');
+const similarity = require('similarity');
+
+const { Observer, BinaryTree } = require('../../../interfaces');
+const { FilterUtil, FilterPresets } = require('../../../../util');
 
 const CONSTANTS = {};
 
-class Automoderation extends Observer {
+module.exports = class AutoModeration extends Observer {
 
     constructor(client) {
 
         super(client, {
-            name: 'automoderation',
+            name: 'autoModeration',
             priority: 1
         });
 
@@ -21,10 +24,124 @@ class Automoderation extends Observer {
             ['message', this.filterMentions.bind(this)]
         ];
 
+        this.whitelist = new BinaryTree(this.client, FilterPresets.whitelist);
+
     }
 
     async filterWords(message, edited) {
 
+        // Word filter: checks a guild message against the guild's wordFilter settings and deletes it on a hit.
+        // Stages run in order and later ones are skipped after a match:
+        // presets -> explicit (whole word) -> fuzzy (similarity) -> tokenized (substring).
+        // "edited" is optional; when given it is the message that gets checked and deleted
+        // (presumably the post-edit version - confirm against the caller). Match details go on msg.filtered.
+        const { guild, author, channel } = message;
+        if (!guild || author.bot) return;
+
+        // Best effort: if the member is uncached and cannot be fetched, treat them as having no roles
+        const member = message.member || await guild.members.fetch(author.id).catch(() => null);
+        const settings = await guild.settings();
+        const setting = settings.wordFilter;
+        const { bypass, ignore, enabled, silent, explicit, fuzzy, tokenized, whitelist, actions, presets } = setting;
+        const roles = member ? member.roles.cache.map((r) => r.id) : [];
+
+        // roles already holds IDs, so they are compared to the bypass list directly
+        if (!enabled || roles.some((id) => bypass.includes(id)) || ignore.includes(channel.id)) return;
+
+        // Which message obj to work with
+        const msg = edited || message;
+        this.client.logger.debug(`Pre norm:\n${msg.cleanContent}`);
+        const content = FilterUtil.normalize(msg.cleanContent);
+        this.client.logger.debug(`Normalized\n${content}`);
+
+        let result = { match: null, matched: false, matcher: null, preset: false };
+        const lowered = content.toLowerCase();
+        const words = lowered.split(' ').filter((elem) => elem.length);
+        // Remove any potential bypass characters
+        const _words = words.map((word) => word.replace(/[.'*]/gu, ''));
+
+        // 1. Filter for preset lists
+        if (presets.length) {
+            const text = _words.join('').replace(/\s/gu, ''); //Also check for spaced out words, ex "f u c k"
+            for (const preset of presets) {
+                //Combine array of presets to one expression
+                const regex = new RegExp(`(${FilterPresets[preset].join(')|(')})`, 'ui');
+                // Parentheses matter: ?: binds looser than ||, so without them a direct content match was dropped
+                const match = content.match(regex) || (text.length === words.length ? text.match(regex) : null);
+                if (!match) continue;
+                this.client.logger.debug(`Message matched with "${preset}" preset list.\nMatch: ${match[0]}\nFull content: ${content}`);
+                result = { match: match[0], matched: true, matcher: preset, preset: true };
+                break;
+            }
+        }
+
+        // 2. Filter explicit - no bypass checking (unless you count normalising the text, i.e. emoji letters => normal letters)
+        if (explicit.length && !result.matched) {
+            for (const word of explicit) {
+                //Do it like this instead of regex so it doesn't match stuff like Scunthorpe with cunt
+                if (words.includes(word)) {
+                    this.client.logger.debug(`Message matched with "${word}" in the explicit list.\nFull content: ${content}`);
+                    result = { match: word, matched: true, matcher: 'explicit', preset: false };
+                    break;
+                }
+            }
+        }
+
+        // 3. Filter fuzzy
+        if (fuzzy.length && !result.matched) {
+            // Whole message squashed into one string; the required similarity drops as the candidate gets longer
+            const text = words.join('').replace(/\s/gu, '');
+            const threshold = 0.93 - 0.165 * Math.log(text.length);
+
+            outer:
+            for (const _word of fuzzy) {
+                for (const word of words) {
+                    const wordSim = similarity(word, _word);
+                    const wordThreshold = 0.93 - 0.165 * Math.log(word.length);
+                    if (wordSim >= wordThreshold) {
+                        // Skipped when on the built-in whitelist, or on the guild whitelist without being an exact match. NOTE(review): parentheses only spell out the existing precedence - confirm intent
+                        if (this.whitelist.find(word) || (whitelist.includes(word) && wordSim < 1)) continue;
+                        this.client.logger.debug(`Message matched with "${_word}" in fuzzy.\nMatched word: ${word}\nFull content: ${content}\nSimilarity: ${wordSim}\nThreshold: ${wordThreshold}`);
+                        result = { match: word, matched: true, _matcher: _word, matcher: `fuzzy [\`${_word}\`, \`${wordSim}\`, \`${wordThreshold}\`]`, preset: false };
+                        break outer;
+                    }
+                }
+
+                const sim = similarity(text, _word);
+                if (sim >= threshold) {
+                    if (this.whitelist.find(text) || (whitelist.includes(text) && sim < 1)) continue;
+                    this.client.logger.debug(`Message matched with "${_word}" in fuzzy.\nMatched word: ${text}\nFull content: ${content}\nSimilarity: ${sim}\nThreshold: ${threshold}`);
+                    result = { match: text, matched: true, _matcher: _word, matcher: `fuzzy [\`${_word}\`, \`${sim}\`, \`${threshold}\`]`, preset: false };
+                    break;
+                }
+
+                this.client.logger.debug(`Message did not match with "${_word}" in fuzzy.\nFull content: ${content}\nSimilarity: ${sim}\nThreshold: ${threshold}`);
+            }
+        }
+
+        // 4. Filter tokenized
+        if (tokenized.length && !result.matched) {
+            for (const word of tokenized) {
+                //Substring match on purpose, so unlike the explicit list this also hits entries embedded in longer words
+                if (lowered.includes(word)) {
+                    this.client.logger.debug(`Message matched with "${word}" in the tokenized list.\nFull content: ${content}`);
+                    result = { match: word, matched: true, matcher: 'tokenized', preset: false };
+                    break;
+                }
+            }
+        }
+
+        // 5. Remove message, inline response and add a reason to msg object
+        if (!result.matched) return;
+        msg.filtered = result;
+        await msg.delete();
+        if (!silent) {
+            const res = await msg.formattedRespond('W_FILTER_DELETE', { params: { user: author.id } });
+            if (res) res.delete({ timeout: 10000 }).catch((err) => this.client.logger.debug(`Could not remove the filter notice: ${err.message}`));
+        }
+
+        // 6. Automated actions (TODO: not implemented yet; "actions" from the settings is reserved for this step)
+
     }
 
     async filterLinks(message, edited) {
@@ -39,6 +156,4 @@ class Automoderation extends Observer {
         
     }
 
-}
-
-module.exports = Automoderation;
\ No newline at end of file
+};
\ No newline at end of file