[geeklog-cvs] geeklog-2/lib PHPSpellChecker.class.php,NONE,1.1
tony at geeklog.net
tony at geeklog.net
Mon Dec 8 10:47:36 EST 2003
Update of /usr/cvs/geeklog/geeklog-2/lib
In directory geeklog_prod:/tmp/cvs-serv7023
Added Files:
PHPSpellChecker.class.php
Log Message:
Initial Release
--- NEW FILE: PHPSpellChecker.class.php ---
<?php
/**
* PHPSpellChecker - A Simple Spellchecking Class in PHP
*
* This source file is subject to version 2.02 of the PHP license,
* that is bundled with this package in the file LICENSE, and is
* available at through the world-wide-web at
* http://www.php.net/license/2_02.txt.
* If you did not receive a copy of the PHP license and are unable to
* obtain it through the world-wide-web, please send a note to
* license at php.net so we can mail you a copy immediately.
*
* @author Tony Bibbs <tony at geeklog.net>
* @copyright Tony Bibbs 2003
* @package net.geeklog.utilities
* @version $Id: PHPSpellChecker.class.php,v 1.1 2003/12/08 15:47:33 tony Exp $
* @todo Need to test the personal dictionary stuff. Haven't even tried to yet.
*
*/
/**
* Used to indicate the server is a posix-compliant one (e.g. linux/unix)
*
* PHPSpellChecker stores this value in its $_operatingSystem property when
* PHP_OS does not start with 'WIN'.
* @const SPELL_OS_POSIX
*/
define('SPELL_OS_POSIX',1);
/**
* Used to indicate the server uses a version of windows (e.g. 98/XP)
*
* PHPSpellChecker stores this value in its $_operatingSystem property when
* PHP_OS starts with 'WIN'.
* @const SPELL_OS_WINDOWS
*/
define('SPELL_OS_WINDOWS',2);
/**
* PHPSpellChecker, a simple spellchecking class
*
* This class makes it easy to implement spellchecking into your PHP applications
* with minimal effort. The only requirement is an installed aspell binary plus the
* dictionaries for the language you want to check against. On Posix hosts
* (Unix/Linux) aspell is often installed by default. Windows users should obtain the
* Aspell setup program and the corresponding dictionary file as none is installed by
* default. Aspell for Posix and Windows can be downloaded from {@link http://aspell.net}
*
* Here is a most basic example of how to use this class:
* <code>
* <?php
* require_once 'PHPSpellChecker.class.php';
* $spellChecker = new PHPSpellChecker();
* $spellChecker->setPathToAspellBinary('/usr/bin/aspell');
* $spellChecker->setTempDir('/tmp');
* echo $spellChecker->checkWords('This is a test, I srue hope therre are no mispelings');
* ?>
* </code>
*
* Errors are reported through trigger_error(); no method of this class throws.
*
* @author Tony Bibbs <tony at geeklog.net>
*
*/
class PHPSpellChecker {
    /**
    * Holds OS type that Aspell is installed on.
    *
    * One of SPELL_OS_POSIX or SPELL_OS_WINDOWS. It is set by the constructor
    * and decides how the aspell command line is assembled.
    *
    * @see __construct()
    * @see checkWords()
    * @var int
    * @access private
    *
    */
    var $_operatingSystem = null;

    /**
    * Holds path to Aspell binary
    *
    * This holds the fully qualified path to the actual Aspell binary. Please
    * specify the path to the executable file, not to a directory. The value is
    * placed on the command line as given (it is NOT shell-escaped), so it must
    * come from trusted configuration. NOTE: windows users may find that aspell
    * is picky about the format of the path. For example, if you have aspell
    * located at C:\Program Files\Aspell\bin\aspell then this value may need to
    * be C:\progra~1\Aspell\bin\aspell
    *
    * @see setPathToAspellBinary()
    * @see checkWords()
    * @var string
    * @access private
    *
    */
    var $_pathToAspellBinary = null;

    /**
    * Code for language to spellcheck against (e.g. en_US)
    *
    * NOTE: you will need to ensure that the respective aspell language
    * dictionaries have been installed otherwise this won't work
    *
    * @see setLanguage()
    * @var string
    * @access private
    *
    */
    var $_languageCode = null;

    /**
    * Holds any additional Aspell options that aren't supported by this
    * class with a setter and/or getter method.
    *
    * Entries with a string key are sent to aspell as "key value" (or just
    * "key" when the value is empty); entries with an integer key are sent as
    * the bare value.
    *
    * @see __construct()
    * @var array
    * @access private
    *
    */
    var $_aspellOptions = null;

    /**
    * Absolute path to the dictionary file handed to aspell with -d
    *
    * @see setPersonalDictionary()
    * @see checkWords()
    * @var string
    * @access private
    *
    */
    var $_pathToPersonalDictionary = null;

    /**
    * Absolute path to the personal word list file handed to aspell with -p
    *
    * @see setPersonalWordList()
    * @see checkWords()
    * @var string
    * @access private
    *
    */
    var $_pathToPersonalWordList = false;

    /**
    * Minimum size of a legitimate word
    *
    * Words shorter than this are not spellchecked. The constructor sets this
    * to 3 through setMinimumWordlimit().
    *
    * @see setMinimumWordlimit()
    * @see checkWords()
    * @var int
    * @access private
    *
    */
    var $_skipLength = null;

    /**
    * Name of temp dir
    *
    * This class puts the text to spell into a temporary file before checking the spelling. This
    * attribute simply gives us a directory where we can create these temporary files (e.g.
    * /tmp)
    *
    * @see _createTempFile()
    * @var string
    * @access private
    *
    */
    var $_tempDir = null;

    /**
    * Name of the temporary file created for the current check
    *
    * @see _createTempFile()
    * @var string
    * @access private
    *
    */
    var $_tempFile = null;

    /**
    * Array of the words that were misspelled
    *
    * Each distinct misspelled word reported by aspell appears once. The values in
    * this array are used to index into the $_suggestions variable below. Array
    * format is $myArray[<key>] = '<misspelled word>'
    *
    * @see checkWords()
    * @var array
    * @access private
    *
    */
    var $_misspellings = null;

    /**
    * Holds suggestions for misspelled words
    *
    * The format of this array is
    * $myArray[<misspelled word>] = '<suggestion1>,<suggestion2>'.
    * Words aspell has no suggestion for map to the string 'No suggestions'.
    *
    * @see checkWords()
    * @var array
    * @access private
    *
    */
    var $_suggestions = null;

    /**
    * Reserved for the positions of misspelled words
    *
    * Initialised to an empty array by the constructor; nothing in this class
    * populates it at the moment.
    *
    * @var array
    * @access private
    *
    */
    var $_positions = null;

    /**
    * Holds the original text we were to spellcheck against
    *
    * This simply holds the original text to spellcheck against. It
    * never gets modified
    *
    * @see checkWords()
    * @var string
    * @access private
    *
    */
    var $_origText = null;

    /**
    * Formatting to use to begin highlighting a misspelled word
    *
    * By default a span tag is used whose title attribute carries the suggestions,
    * so they show up when hovering over the misspelled word. The special tag
    * {suggested} is replaced with the suggestion list of the word.
    *
    * @see __construct()
    * @see checkWords()
    * @see setHighlight()
    * @var string
    * @access private
    *
    */
    var $_beginHighlight = null;

    /**
    * Formatting to use to end highlighting a misspelled word
    *
    * @see __construct()
    * @see checkWords()
    * @see setHighlight()
    * @var string
    * @access private
    *
    */
    var $_endHighlight = null;

    /**
    * Constructor
    *
    * Detects the operating system, stores the extra aspell options and applies the
    * defaults (language en_US, minimum word length 3, yellow span highlight).
    * When setting aspell options, be sure to use the option names as shown by calling
    * aspell with no options from your unix or dos shell. Note that some of the aspell
    * options interact with methods of this class:
    * - setMinimumWordlimit(), since this is explicitly called in the constructor the
    *   resulting --ignore option is always sent to aspell, after your own options
    * - setPersonalWordList(), this isn't called explicitly so you can use the
    *   -p,--personal options or call that function instead.
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access public
    * @param array $aspellOptions Options for aspell in format myArray['aspellOption'] => 'aspellValue'
    *
    */
    function __construct($aspellOptions = '')
    {
        // Right now this is only used when determining how to call aspell but
        // it is kept in case other OS specific differences show up later.
        if (substr(PHP_OS, 0, 3) == 'WIN') {
            $this->_operatingSystem = SPELL_OS_WINDOWS;
        } else {
            $this->_operatingSystem = SPELL_OS_POSIX;
        }

        if (empty($aspellOptions)) {
            $this->_aspellOptions = array();
        } elseif (is_array($aspellOptions)) {
            $this->_aspellOptions = $aspellOptions;
        } else {
            // A single option given as a plain string is sent as a bare option.
            $this->_aspellOptions = array($aspellOptions);
        }

        $this->_misspellings = array();
        $this->_suggestions = array();
        $this->_positions = array();

        $this->setLanguage();
        $this->setMinimumWordlimit();
        $this->setHighlight('<span title="{suggested}" style="background-color: #fdd905";>','</span>');
    }

    /**
    * Old-style (PHP 4) constructor
    *
    * Kept so the class still initialises itself on engines that only know
    * same-named constructors; it simply forwards to __construct().
    *
    * @access public
    * @param array $aspellOptions See __construct()
    *
    */
    function PHPSpellChecker($aspellOptions = '')
    {
        $this->__construct($aspellOptions);
    }

    /**
    * Sets the language that should be used.
    *
    * Allows applications to set the language to spellcheck against. We
    * assume US English by default
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access public
    * @param string $languageCode Code of language to use
    *
    */
    function setLanguage($languageCode = 'en_US')
    {
        $this->_languageCode = $languageCode;
    }

    /**
    * Sets the directory in which the temporary text file is created
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access public
    * @param string $absolutePath Path to the temporary directory
    *
    */
    function setTempDir($absolutePath = '')
    {
        $this->_tempDir = $absolutePath;
    }

    /**
    * Sets the location to the Aspell binary
    *
    * This sets the path to the actual aspell binary. Note that
    * this must be an absolute path, don't depend on your
    * operating system's path variables.
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access public
    * @param string $absoluteFileName Location of aspell binary
    *
    */
    function setPathToAspellBinary($absoluteFileName)
    {
        $this->_pathToAspellBinary = $absoluteFileName;
    }

    /**
    * Misspelled alias of setPathToAspellBinary()
    *
    * This was the only name the method originally had, so it is kept for
    * callers written against it.
    *
    * @access public
    * @param string $absoluteFileName Location of aspell binary
    * @deprecated use setPathToAspellBinary()
    *
    */
    function setPathToApsellBinary($absoluteFileName)
    {
        $this->setPathToAspellBinary($absoluteFileName);
    }

    /**
    * Sets the minimum size of what a 'word' is defined as.
    *
    * For example if you set the minimum size to 3 then any word less than
    * 3 characters long is not spellchecked.
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access public
    * @param int $minWordLength Minimum length of a word
    *
    */
    function setMinimumWordlimit($minWordLength = 3)
    {
        $this->_skipLength = $minWordLength;
    }

    /**
    * Sets the location of the dictionary to use
    *
    * The file is passed to aspell with the -d option by checkWords().
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access public
    * @param string $absoluteFileName Exact location of dictionary to use
    *
    */
    function setPersonalDictionary($absoluteFileName)
    {
        $this->_pathToPersonalDictionary = $absoluteFileName;
    }

    /**
    * Sets the file where the current user's personal word list can be found.
    *
    * The file is passed to aspell with the -p option by checkWords(). This
    * feature is optional and disabled by default. This method only records the
    * path; it does not create or validate the file.
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access public
    * @param string $absoluteFileName The fully qualified location of the .pws file to use
    *
    */
    function setPersonalWordList($absoluteFileName)
    {
        $this->_pathToPersonalWordList = $absoluteFileName;
    }

    /**
    * Creates a temp file for use during the spellcheck process
    *
    * The text in $_origText is written to a new file in $_tempDir (or the
    * system temp directory when none was set). The first line is "!" and every
    * line of text is prefixed with "^", which is the input format aspell's pipe
    * mode is fed by checkWords().
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access private
    * @return boolean True if successful otherwise false
    *
    */
    function _createTempFile()
    {
        // Check the text first so that no file is left behind when there is
        // nothing to write.
        if ($this->_origText === null || (string) $this->_origText === '') {
            trigger_error('No text has been set.');
            return false;
        }

        $tempDir = $this->_tempDir;
        if (empty($tempDir)) {
            $tempDir = function_exists('sys_get_temp_dir') ? sys_get_temp_dir() : '';
        }

        $this->_tempFile = tempnam($tempDir, 'textToCheck');
        if ($this->_tempFile == false) {
            trigger_error('Unable to write the temporary file.');
            return false;
        }

        $fd = fopen($this->_tempFile, 'w');
        if (!$fd) {
            trigger_error('Unable to open the temporary file.');
            // tempnam() already created the file, so remove it again.
            unlink($this->_tempFile);
            return false;
        }

        fwrite($fd, "!\n");
        foreach (explode("\n", (string) $this->_origText) as $textLine) {
            // Drop the carriage return of CRLF text; the line feed is re-added.
            fwrite($fd, '^' . rtrim($textLine, "\r") . "\n");
        }
        fclose($fd);

        return true;
    }

    /**
    * Adds a word to a user's personal dictionary.
    *
    * Not implemented yet: this method does nothing and always reports failure.
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access public
    * @param string $word Word to add to personal dictionary
    * @return boolean Always false
    *
    */
    function addWord($word)
    {
        return false;
    }

    /**
    * Parse the return output from aspell
    *
    * Only lines starting with "&" (misspelled, with suggestions) or "#"
    * (misspelled, no suggestions) are used; every other line is skipped.
    * The second space separated field of such a line is the misspelled word
    * and, for "&" lines, everything after the first colon is the comma
    * separated suggestion list. Results are merged into $_misspellings
    * (each word once) and $_suggestions.
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access private
    * @param string $aspellReturn Return output from aspell
    *
    */
    function _parseReturn($aspellReturn)
    {
        if (!is_array($this->_misspellings)) {
            $this->_misspellings = array();
        }
        if (!is_array($this->_suggestions)) {
            $this->_suggestions = array();
        }

        foreach (explode("\n", (string) $aspellReturn) as $line) {
            $line = rtrim($line, "\r");
            if ($line === '') {
                continue;
            }

            $marker = substr($line, 0, 1);
            if ($marker != '&' && $marker != '#') {
                continue;
            }

            // For "&" lines only the part before the colon holds the fields.
            $colonPos = ($marker == '&') ? strpos($line, ':') : false;
            $head = ($colonPos === false) ? $line : substr($line, 0, $colonPos);
            $fields = explode(' ', $head);
            if (!isset($fields[1]) || $fields[1] === '') {
                continue;
            }
            $word = $fields[1];

            // Start from an empty list for every line so that one word never
            // inherits the suggestions of a previous one.
            $suggestions = array();
            if ($colonPos !== false) {
                foreach (explode(',', substr($line, $colonPos + 1)) as $aSuggestion) {
                    $aSuggestion = trim($aSuggestion);
                    if ($aSuggestion !== '') {
                        $suggestions[] = $aSuggestion;
                    }
                }
            }

            if (!in_array($word, $this->_misspellings, true)) {
                $this->_misspellings[] = $word;
            }

            if (count($suggestions) > 0) {
                $this->_suggestions[$word] = implode(',', $suggestions);
            } else {
                // Should not make this string english dependent. For future we should
                // make this string configurable so it can be overridden by calling
                // applications language preference.
                $this->_suggestions[$word] = 'No suggestions';
            }
        }
    }

    /**
    * Assembles the shell command used to run aspell on the temporary file
    *
    * All values except the path to the binary are escaped with
    * escapeshellarg(). The binary path is used as configured.
    *
    * @access private
    * @param boolean $isHTMLEmbedded Whether to add aspell's -H option
    * @return string The complete command line
    *
    */
    function _buildCommand($isHTMLEmbedded)
    {
        $addOptions = '';

        if ($isHTMLEmbedded == true) {
            // NOTE, it appears this doesn't work so well with HREF's (or at all). Most
            // other tags seem ok
            $addOptions .= ' -H';
        }
        if (!empty($this->_pathToPersonalDictionary)) {
            $addOptions .= ' -d ' . escapeshellarg($this->_pathToPersonalDictionary);
        }
        if (!empty($this->_pathToPersonalWordList)) {
            $addOptions .= ' -p ' . escapeshellarg($this->_pathToPersonalWordList);
        }

        // Grab any other aspell options
        if (is_array($this->_aspellOptions)) {
            foreach ($this->_aspellOptions as $optionName => $optionValue) {
                if (!is_int($optionName)) {
                    $addOptions .= ' ' . escapeshellarg($optionName);
                }
                if ($optionValue !== null && (string) $optionValue !== '') {
                    $addOptions .= ' ' . escapeshellarg($optionValue);
                }
            }
        }

        // aspell's --ignore=N skips words of N characters or less, while
        // $_skipLength is the shortest length that still counts as a word,
        // hence the "- 1". It is added after the caller's options so that it
        // is the last --ignore value on the command line.
        if (is_numeric($this->_skipLength) && (int) $this->_skipLength > 1) {
            $addOptions .= ' --ignore=' . ((int) $this->_skipLength - 1);
        }

        $command = sprintf(
            '%s -a --lang=%s%s < %s',
            $this->_pathToAspellBinary,
            escapeshellarg($this->_languageCode),
            $addOptions,
            escapeshellarg($this->_tempFile)
        );

        // Calling aspell differs slightly by OS
        if ($this->_operatingSystem == SPELL_OS_WINDOWS) {
            $command .= ' 2>&1';
        }

        return $command;
    }

    /**
    * Spell checks a text with aspell and highlights the misspelled words.
    *
    * The text is written to a temporary file which is piped into the aspell
    * binary; the output is parsed into $_misspellings and $_suggestions, which
    * are reset at the start of every call. Problems are reported through
    * trigger_error().
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access public
    * @param string $textString Text to check
    * @param boolean $isHTMLEmbedded Indicates if the text to check has HTML in it
    * @return string|null $textString with misspelled words highlighted, or null
    *                     when no misspelled word was found or the check failed
    *
    */
    function checkWords($textString, $isHTMLEmbedded = false)
    {
        $this->_origText = $textString;

        // Forget the results of any previous call.
        $this->_misspellings = array();
        $this->_suggestions = array();

        if (empty($this->_pathToAspellBinary)) {
            trigger_error('The path to the Aspell binary has not been set.');
            return null;
        }

        if ($this->_createTempFile() != false) {
            // Execute the command.
            $retValue = shell_exec($this->_buildCommand($isHTMLEmbedded));

            // Delete the temporary file created.
            if (unlink($this->_tempFile) == false) {
                trigger_error('Unable to delete temporary file.');
            }

            // Check to see if the execute statement returned anything.
            if (empty($retValue)) {
                trigger_error('There was a problem in executing the command against the Aspell binary.');
            } else {
                $this->_parseReturn($retValue);
            }
        } else {
            trigger_error('Unable to create the temporary file.');
        }

        if (count($this->_misspellings) > 0) {
            return $this->highlightBadWords($isHTMLEmbedded);
        }

        return null;
    }

    /**
    * Allows ability to set a custom method for highlight misspelled words
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access public
    * @param string $beginHighlight Opening format of a highlighted word; the tag
    *                               {suggested} is replaced with the suggestions
    * @param string $endHighlight Closing format of a highlighted word
    *
    */
    function setHighlight($beginHighlight, $endHighlight)
    {
        $this->_beginHighlight = $beginHighlight;
        $this->_endHighlight = $endHighlight;
    }

    /**
    * Highlights misspelled words and returns the text
    *
    * Works on $_origText using the words in $_misspellings. Matching is case
    * insensitive, so words that differ only in case are handled in one pass
    * (using the suggestions of the first spelling) to avoid wrapping the same
    * word twice.
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access public
    * @param boolean $isHTMLEmbedded Whether the text contains HTML tags that must be left alone
    * @return string original search text with misspellings highlighted.
    *
    */
    function highlightBadWords($isHTMLEmbedded = false)
    {
        $tmpString = $this->_origText;
        if (!is_array($this->_misspellings)) {
            return $tmpString;
        }

        $handled = array();
        foreach ($this->_misspellings as $curWord) {
            $lookupKey = strtolower($curWord);
            if (isset($handled[$lookupKey])) {
                continue;
            }
            $handled[$lookupKey] = true;

            if (isset($this->_suggestions[$curWord])) {
                $suggested = $this->_suggestions[$curWord];
            } else {
                $suggested = 'No suggestions';
            }
            $tmpBeginHighlight = str_replace('{suggested}', $suggested, $this->_beginHighlight);

            if ($isHTMLEmbedded == true) {
                $tmpString = $this->_getHighlightHTML($tmpBeginHighlight, $curWord, $tmpString);
            } else {
                $tmpString = $this->_getHighlight($tmpBeginHighlight, $curWord, $tmpString);
            }
        }

        return $tmpString;
    }

    /**
    * Highlights text
    *
    * Wraps every whole-word, case insensitive occurrence of $needle in
    * $haystack with $beginHighlight and the $_endHighlight property. This is
    * not meant for strings with embedded HTML
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access private
    * @param string $beginHighlight How to begin highlight text. NOTE that the _endHighlight property
    *                               is used to close this one.
    * @param string $needle Text to highlight
    * @param string $haystack Text to search
    * @return string $haystack with the occurrences wrapped; $haystack unchanged
    *                when $needle is empty or the replacement fails
    * @see _getHighlightHTML()
    *
    */
    function _getHighlight($beginHighlight, $needle, $haystack)
    {
        if ((string) $needle === '') {
            return $haystack;
        }

        // preg_quote() is told about the '|' delimiter so it gets escaped too.
        $pattern = '|\b(' . preg_quote($needle, '|') . ')\b|i';
        $highlighted = preg_replace($pattern, $beginHighlight . '\\1' . $this->_endHighlight, $haystack);

        return ($highlighted === null) ? $haystack : $highlighted;
    }

    /**
    * Highlights some text that is embedded within HTML
    *
    * The text is split into tags (anything from "<" to the next ">") and the
    * text between them. Only the text between tags is highlighted, so
    * attribute values and highlight markup added earlier are never touched.
    * Apparently there is a bug or something with aspell's SGML mode
    * (-H option) that prevents text within a <a href=""> and </a> tag from
    * being checked. That is a limitation of aspell not this method.
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access private
    * @param string $beginHighlight How to begin highlight text. NOTE that the _endHighlight property
    *                               is used to close this one.
    * @param string $needle Text to highlight
    * @param string $haystack Text to search
    * @return string $haystack with the occurrences outside of tags wrapped
    * @see _getHighlight()
    *
    */
    function _getHighlightHTML($beginHighlight, $needle, $haystack)
    {
        $segments = preg_split('/(<[^>]*>)/', $haystack, -1, PREG_SPLIT_DELIM_CAPTURE);
        if ($segments === false) {
            return $haystack;
        }

        // With PREG_SPLIT_DELIM_CAPTURE the captured tags sit at the odd
        // indexes and the text between them at the even ones.
        foreach ($segments as $index => $segment) {
            if ($index % 2 == 0 && $segment !== '') {
                $segments[$index] = $this->_getHighlight($beginHighlight, $needle, $segment);
            }
        }

        return implode('', $segments);
    }
}
?>
More information about the geeklog-cvs
mailing list