#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <string>
#include "translate.h" // alphabet translation tables
#include "cleanl.h" // header file for clean language class
#include "gestalt.h"
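// Word database: a packed run of NUL-terminated strings. The bytes are in the
// alphabet produced by Translate, not plain ASCII (CheckWord() translates the
// candidate word in before comparing and translates the replacement back out).
// CheckWord() consumes the entries in pairs: a pattern followed by its
// replacement. A pattern whose first byte equals the translated '*' is
// matched as a substring (wildcard); any other pattern is fuzzy-compared
// against the whole word.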
static const unsigned char data[] =
{
224,105,247,13,163,30,149,0,90,178,30,149,30,150,37,0,224,105,
247,13,163,54,195,180,0,103,71,205,30,190,37,163,54,195,180,0,
224,105,247,13,163,0,190,37,163,30,8,103,71,205,30,0,224,150,
178,54,90,0,150,90,247,105,105,0,224,184,247,150,150,222,0,163,
54,90,90,30,195,0,224,84,247,103,103,150,178,54,90,0,69,71,
71,8,69,71,71,0,224,190,71,90,178,30,149,105,247,13,163,30,
149,0,105,37,190,54,103,222,8,180,247,222,0,224,37,150,150,178,
71,103,30,0,84,71,43,71,0,224,190,71,90,178,30,149,105,247,
13,163,54,195,180,0,105,247,195,8,150,30,30,163,54,195,180,0,
224,84,54,90,13,178,0,105,30,190,37,103,30,8,69,71,180,0,
224,13,71,13,163,150,247,13,163,30,149,0,184,103,30,37,150,247,
149,30,8,180,54,205,30,149,0,224,105,247,13,163,30,69,0,184,
30,195,30,90,149,37,90,30,69,0,224,105,247,13,163,30,69,0,
54,195,8,149,30,37,103,103,222,8,69,30,30,184,8,90,149,71,
247,84,103,30,0,224,37,150,150,178,71,103,30,0,84,54,180,8,
69,71,195,163,30,222,0,224,84,54,90,13,178,0,195,54,13,30,
8,180,247,222,0,224,105,247,163,0,150,178,71,205,30,0,224,105,
247,13,0,150,178,71,205,30,0,224,195,54,180,180,30,149,0,180,
149,30,37,90,8,180,247,222,0,224,184,247,150,150,222,0,195,54,
13,30,8,180,247,222,0,224,150,178,54,90,0,13,149,37,184,0,
224,150,178,222,90,0,13,149,37,184,0,224,195,30,180,30,149,0,
195,54,13,30,8,180,247,222,0,184,149,54,13,163,0,150,178,37,
149,184,8,184,37,54,195,0,178,30,103,103,0,178,30,13,163,0,
69,37,190,195,0,69,37,149,195,0,13,247,195,90,0,163,54,90,
90,222,0,184,54,150,150,0,128,71,151,244,0,150,178,54,90,178,
30,37,69,0,184,54,195,178,30,37,69,0,180,71,69,69,37,190,
195,30,69,0,180,71,150,178,8,69,37,149,195,0,84,54,90,13,
178,54,195,180,0,200,178,54,195,54,195,180,0,84,37,150,90,37,
149,69,0,105,37,90,178,30,149,103,30,150,150,0,37,150,150,0,
69,71,195,163,30,222,0,13,103,54,90,0,90,71,222,0,150,247,
13,163,150,0,54,150,8,103,37,190,30,0,13,71,13,163,0,149,
71,71,150,90,30,149,0,69,54,13,163,0,134,54,13,178,37,149,
69,0,69,54,13,163,178,30,37,69,0,134,54,13,178,37,149,69,
155,150,8,195,71,180,180,54,195,0,13,247,190,0,103,54,22,247,
54,69,0,84,103,71,200,239,71,84,0,54,195,105,103,37,90,54,
71,195,37,149,222,8,30,190,184,103,71,222,190,30,195,90,0,90,
54,90,0,84,71,71,84,54,30,0,90,54,90,150,0,184,54,103,
103,71,200,150,0,84,37,103,103,150,0,84,149,37,205,30,149,222,
0,205,37,180,54,195,37,0,103,37,149,180,30,8,71,184,30,195,
54,195,180,0,184,30,195,54,150,0,69,71,103,184,178,54,195,0,
71,149,180,37,190,150,0,71,149,54,180,37,190,54,0,84,149,30,
37,150,90,150,0,13,178,54,13,163,30,195,150,0,84,149,30,37,
150,90,0,150,71,105,90,84,37,103,103,0,30,13,150,90,37,13,
222,0,178,37,184,184,222,8,69,37,222,150,0,195,54,184,184,103,
30,0,184,37,13,54,105,54,30,149,0,150,247,13,163,103,30,69,
0,69,149,37,195,163,0,190,37,195,178,71,71,69,0,90,178,30,
8,84,54,180,8,71,195,30,0,84,54,71,90,13,178,0,195,54,
13,30,8,180,247,222,0,84,54,37,90,13,178,0,195,54,13,30,
8,180,247,222,0,84,247,195,180,0,195,71,150,30,0,13,178,54,
180,37,84,71,71,0,195,54,13,30,8,180,247,222,0,13,178,54,
195,13,0,195,54,13,30,8,180,247,222,0,13,178,54,195,163,0,
195,54,13,30,8,180,247,222,0,13,103,54,90,0,84,54,180,8,
90,71,30,0,13,71,13,163,0,84,54,180,8,90,71,30,0,13,
247,190,0,150,184,37,190,0,150,247,13,163,0,84,149,30,37,90,
178,30,8,90,178,149,71,247,180,178,8,37,8,150,90,149,37,200,
0,13,247,195,195,54,103,54,195,180,247,150,0,195,71,150,30,8,
149,247,84,84,54,195,180,0,13,247,195,90,0,195,71,150,90,149,
54,103,0,69,54,13,163,0,84,54,180,8,90,71,30,0,69,54,
103,69,71,0,90,71,37,150,90,30,149,0,69,222,163,30,0,195,
54,13,30,8,180,247,222,0,105,37,180,0,195,54,13,30,8,180,
247,222,0,105,30,103,103,37,90,54,71,0,163,54,150,150,54,195,
180,0,180,71,69,69,0,180,71,103,103,222,0,239,37,13,163,71,
0,195,54,13,30,8,180,247,222,0,239,30,149,163,71,0,195,54,
13,30,8,180,247,222,0,239,30,149,163,0,195,54,13,30,8,180,
247,222,0,239,54,180,37,84,71,71,0,195,54,13,30,8,180,247,
222,0,239,54,150,190,0,150,184,37,190,0,239,54,43,0,150,184,
37,190,0,163,54,163,30,0,195,54,13,30,8,180,247,222,0,163,
103,54,90,0,195,71,150,90,149,54,103,0,163,247,195,90,0,195,
54,13,30,8,180,247,222,0,163,222,163,30,0,195,54,13,30,8,
180,247,222,0,103,30,150,84,71,0,195,54,13,30,8,180,247,222,
0,103,30,43,84,71,0,195,54,13,30,8,180,247,222,0,195,54,
180,180,0,195,54,13,30,8,180,247,222,0,184,30,195,54,150,0,
84,54,180,8,90,71,30,0,184,54,150,150,0,184,247,163,30,0,
184,178,247,0,150,178,71,205,30,0,184,149,54,13,163,0,195,54,
13,30,8,180,247,222,0,150,195,37,90,13,178,0,195,71,150,90,
149,54,103,0,150,184,30,149,190,0,239,30,103,103,71,0,150,184,
54,13,0,195,54,13,30,8,180,247,222,0,90,54,90,0,150,184,
37,149,149,71,200,0,90,200,37,90,0,195,71,150,90,149,54,103,
0,205,37,180,54,195,0,195,71,150,90,149,54,103,0,200,71,184,
0,195,54,13,30,8,180,247,222,0,37,149,150,13,178,0,69,71,
195,163,30,222,0,84,103,37,150,30,195,0,180,30,149,190,37,195,
8,200,71,149,69,0,84,247,190,150,30,195,0,180,30,149,190,37,
195,8,200,71,149,69,0,105,54,13,163,0,150,178,71,205,30,0,
105,54,13,163,30,195,0,180,30,149,190,37,195,8,200,71,149,69,
0,105,54,13,163,30,149,0,180,30,149,190,37,195,8,200,71,149,
69,0,105,247,90,43,30,0,180,30,149,190,37,195,8,200,71,149,
69,0,180,71,90,90,205,30,149,69,37,190,190,90,0,180,71,103,
103,222,0,178,54,90,103,30,149,0,150,190,247,149,105,0,178,247,
149,30,0,195,54,13,30,8,180,247,222,0,178,247,149,30,150,71,
178,195,0,195,54,13,30,8,180,247,222,0,54,90,37,163,30,149,
0,180,30,149,190,37,195,8,200,71,149,69,0,163,37,105,105,30,
149,0,180,30,149,190,37,195,8,200,71,149,69,0,163,103,54,90,
71,149,54,150,0,195,71,150,90,149,54,103,0,103,30,150,84,30,
0,195,54,13,30,8,180,247,222,0,195,37,43,54,0,195,54,13,
30,8,180,247,222,0,195,247,90,90,30,0,180,30,149,190,37,195,
8,200,71,149,69,0,184,54,30,190,30,103,0,180,30,149,190,37,
195,8,200,71,149,69,0,184,54,190,190,30,103,0,180,30,149,190,
37,195,8,200,71,149,69,0,184,54,150,150,30,0,184,247,163,30,
0,184,54,150,150,30,195,0,184,247,163,54,195,180,0,184,54,150,
150,30,149,0,195,54,13,30,8,180,247,222,0,184,71,103,37,163,
30,0,195,54,13,30,8,180,247,222,0,150,13,178,103,37,190,184,
30,0,180,30,149,190,37,195,8,200,71,149,69,0,150,13,178,200,
37,195,43,0,84,54,180,8,90,71,30,0,150,13,178,200,30,54,
195,0,195,54,13,30,8,180,247,222,0,150,13,178,200,247,13,178,
90,30,103,0,180,30,149,190,37,195,8,200,71,149,69,0,150,13,
178,200,30,103,0,180,30,149,190,37,195,8,200,71,149,69,0,150,
13,178,200,247,103,54,195,0,180,30,149,190,37,195,8,200,71,149,
69,0,150,184,30,149,190,37,0,150,184,37,190,0,90,54,90,90,
30,0,150,184,37,149,149,71,200,0,200,54,13,178,150,30,0,180,
30,149,190,37,195,8,200,71,149,69,0,
};
// Loads the word database into mWords, builds the alphabet translation
// table, and caches the translated form of the wildcard marker '*'.
CleanLanguage::CleanLanguage(void)
{
  // Unpack the database: it is a run of NUL-terminated strings. The array
  // ends on the terminator of its last string (there is no extra empty
  // string after it), so the walk must be bounded by the array size; testing
  // only *cursor would read one byte past the end of 'data'.
  const unsigned char *cursor = data;
  const unsigned char *const end = data + sizeof(data);
  while ( cursor < end && *cursor )
  {
    String str = (const char *)cursor;
    mWords.push_back(str);
    cursor += str.size()+1; // step over the string and its terminator
  }

  // Allocated only after the word list is built, so an exception thrown
  // while filling mWords cannot leak the translator.
  mTranslate = new Translate(13); // build ascii translation table

  // get translated wildcard character.
  mWildChar = mTranslate->TranslateIn( (unsigned char) '*' );
}
// Releases the translation table created by the constructor.
CleanLanguage::~CleanLanguage()
{
  delete mTranslate;
}
// Reports whether 'c' is an unaccented letter: 'a'..'z' or 'A'..'Z'.
// Everything else, including the NUL terminator, yields false.
bool CleanLanguage::IsAlpha(char c)
{
  const bool lower = ( 'a' <= c && c <= 'z' );
  const bool upper = ( 'A' <= c && c <= 'Z' );
  return lower || upper;
}
// Appends one character to the NUL-terminated buffer being assembled.
//
// c      - character to append.
// clean  - destination buffer with room for 'maxlen' chars.
// index  - current length of the text in 'clean'; incremented on success.
// maxlen - total capacity of 'clean', terminator included.
//
// Returns true if the character was stored, false if the buffer is full
// (in which case nothing is modified).
bool CleanLanguage::Add(char c,char *clean,int &index,int maxlen)
{
  // Both the new character and the terminator after it must fit.
  if ( index >= (maxlen-1) ) return false;

  clean[index++] = c;
  clean[index] = 0;
  return true;
}
// will analyze an input string for foul language, and will replace
// objectionable language with non-offending ascii text in the ouput
// string. Returns count of the number of offending words found.
int CleanLanguage::WashYourMouth(const String &input,String &output) const
{
int l = input.size();
if ( !l ) return 0;
int dirtycount = 0;
int maxlen = l*8; // maximum size output version could ever be.
char *clean = new char[maxlen];
char *clean_alloc = clean;
int wordlen = maxlen;
if ( wordlen < 256 ) wordlen = 256;
char *word = new char[wordlen];
*clean = 0;
int i = 0;
const char *dirty = input.c_str();
while ( *dirty )
{
while ( !IsAlpha(*dirty) && *dirty )
{
Add(*dirty,clean,i,maxlen); // add non-alphabetic characters.
dirty++;
}
char *wordat = word;
while ( IsAlpha(*dirty) && *dirty )
{
*wordat++ = *dirty++;
}
*wordat = 0;
// grab sequence of alphabetic characters and then analyze them.
String wordin = word;
String wordout;
int match = CheckWord(wordin,wordout);
if ( match ) dirtycount++;
const char *foo = wordout.c_str();
while ( *foo )
{
Add(*foo++,clean,i,maxlen);
}
}
output = clean_alloc;
delete word;
delete clean_alloc;
return dirtycount;
}
// Decides whether a single word is objectionable and produces the text that
// should appear in its place.
//
// bad  - word to examine.
// good - receives the replacement from the database when a match is found,
//        the literal "too long" when the word has 255 or more characters,
//        or a copy of 'bad' when nothing matched.
//
// Returns 0 when the word is accepted unchanged. Otherwise returns the match
// score: 100 for a wildcard hit or an over-long word, else the
// Gestalt::FuzzyCompare result that passed the length-based threshold.
int CleanLanguage::CheckWord(const String &bad,String &good) const
{
  int size = bad.size();
  if ( size >= 255 ) // words of 255 characters or more are rejected outright
  {
    good = "too long";
    return 100; // replaced!
  }

  int percent = 0; // best matching percentage.
  char temp[256];  // lower-cased copy of 'bad', converted to the database alphabet
  char match[256]; // replacement for the best match (database alphabet until translated out)
  match[0] = 0;

  strcpy(temp,bad.c_str()); // fits: length is below 255 (checked above)

  // Remember the capitalization of the first letter so it can be restored
  // on the replacement.
  const bool uppercase = ( *temp >= 'A' && *temp <= 'Z' );

  // Lower-case 'A'..'Z' in place. Done by hand because strlwr() is not a
  // standard library function and is missing on many toolchains.
  for ( char *p = temp; *p; ++p )
  {
    if ( *p >= 'A' && *p <= 'Z' ) *p = (char)(*p - 'A' + 'a');
  }

  mTranslate->TranslateIn((unsigned char *)temp); // change alphabet

  // mWords holds (pattern, replacement) pairs, one string after the other.
  StringVector::const_iterator i = mWords.begin();
  while ( i != mWords.end() )
  {
    const unsigned char *word = (const unsigned char *)(*i).c_str();
    ++i;
    if ( i == mWords.end() ) break; // pattern without a replacement: ignore it
    const char *replacement = (*i).c_str();
    ++i;

    int score = 0; // stays 0 unless this pattern qualifies as a match
    if ( *word == mWildChar ) // process as a wildcard!
    {
      // A wildcard is only considered while nothing else has matched yet.
      if ( !percent && WildCard((const char *)&word[1],temp) )
      {
        score = 100; // perfect match to wildcard!!
      }
    }
    else
    {
      // Perform fuzzy compare between the two strings.
      int alike = Gestalt::FuzzyCompare(temp,(const char *)word);
      // Heuristic based on length of word and percentage match:
      // the shorter the word, the closer the match has to be.
      if ( ( size <= 4 && alike > 99 ) ||
           ( size >= 5 && size < 7 && alike > 85 ) ||
           ( size >= 7 && size < 10 && alike > 80 ) ||
           ( size >= 10 && alike > 75 ) )
      {
        score = alike;
      }
    }

    if ( score > percent ) // best match so far.
    {
      // Bounded copy so an oversized database entry cannot overrun 'match'.
      strncpy(match,replacement,sizeof(match)-1);
      match[sizeof(match)-1] = 0;
      percent = score;
    }
  }

  if ( percent )
  {
    mTranslate->TranslateOut((unsigned char *)match); // back to the normal alphabet
    // Restore the capital, but only if the replacement really starts with a
    // lower-case letter; otherwise subtracting 32 would corrupt the character.
    if ( uppercase && match[0] >= 'a' && match[0] <= 'z' )
    {
      match[0] = (char)(match[0] - 32);
    }
    good = match;
  }
  else
  {
    good = bad; // nothing wrong with it.
  }
  return percent;
}
// Substring test: returns true when the character sequence 'wild' occurs
// anywhere inside 'match'. An empty 'wild' always matches.
bool CleanLanguage::WildCard(const char *wild,const char *match)
{
  return strstr(match,wild) != 0;
}
|