[geeklog-cvs] geeklog-2/lib PHPSpellChecker.class.php,NONE,1.1

tony at geeklog.net tony at geeklog.net
Mon Dec 8 10:47:36 EST 2003


Update of /usr/cvs/geeklog/geeklog-2/lib
In directory geeklog_prod:/tmp/cvs-serv7023

Added Files:
	PHPSpellChecker.class.php 
Log Message:
Initial Release

--- NEW FILE: PHPSpellChecker.class.php ---
<?php

/**
* PHPSpellChecker - A Simple Spellchecking Class in PHP
*
* This source file is subject to version 2.02 of the PHP license,
* that is bundled with this package in the file LICENSE, and is
* available at through the world-wide-web at
* http://www.php.net/license/2_02.txt.
* If you did not receive a copy of the PHP license and are unable to
* obtain it through the world-wide-web, please send a note to
* license at php.net so we can mail you a copy immediately.
*
* @author Tony Bibbs <tony at geeklog.net>
* @copyright Tony Bibbs 2003
* @package net.geeklog.utilities
* @version $Id: PHPSpellChecker.class.php,v 1.1 2003/12/08 15:47:33 tony Exp $
* @todo Need to test the personal dictionary stuff.  Haven't even tried to yet.
*
*/

/**
* Used to indicate the server is a posix-compliant one (e.g. linux/unix).
* Stored in PHPSpellChecker::$_operatingSystem by the constructor.
* @const SPELL_OS_POSIX
*/
define('SPELL_OS_POSIX',1);

/**
* Used to indicate the server uses a version of windows (e.g. 98/XP).
* Stored in PHPSpellChecker::$_operatingSystem by the constructor.
* @const SPELL_OS_WINDOWS
*/
define('SPELL_OS_WINDOWS',2);

/**
* PHPSpellChecker, a simple spellchecking class
*
* This class makes it easy to implement spellchecking into your PHP applications
* with minimal effort.  The only requirement is an installed aspell binary that PHP
* is allowed to execute through shell_exec().  Windows users should obtain the
* Aspell setup program and the corresponding dictionary file as none is installed by
* default.  Aspell for Posix and Windows can be downloaded from @link http://aspell.net
*
* Here is a most basic example of how to use this class:
* <code>
* <?php
* require_once 'PHPSpellChecker.class.php';
* $spellChecker = new PHPSpellChecker();
* $spellChecker->setPathToAspellBinary('/usr/bin/aspell');
* $spellChecker->setTempDir('/tmp');
* echo $spellChecker->checkWords('This is a test, I srue hope therre are no mispelings');
* ?>
* </code>
*
* @author Tony Bibbs <tony at geeklog.net>
*
*/
class PHPSpellChecker {
    /**
    * Holds OS type that Aspell is installed on.
    *
    * Either SPELL_OS_WINDOWS or SPELL_OS_POSIX.  It is only used to decide how
    * the aspell command line is assembled.
    *
    * @see __construct()
    * @see _buildCommand()
    * @var int
    * @access private
    *
    */
    var $_operatingSystem = null;

    /**
    * Holds path to Aspell binary
    *
    * This holds the fully qualified path to the actual Aspell binary.  Please
    * specify the path to the executable file, not to a directory.  NOTE: windows
    * users may find that aspell is picky about the format of the path.  For example,
    * if you have aspell located at C:\Program Files\Aspell\bin\aspell then this value
    * will need to be C:\progra~1\Aspell\bin\aspell
    *
    * @see setPathToAspellBinary()
    * @see checkWords()
    * @var string
    * @access private
    *
    */
    var $_pathToAspellBinary = null;

    /**
    * Code for language to spellcheck against
    *
    * NOTE: you will need to ensure that your respective language dictionaries
    * have been installed otherwise this won't work
    *
    * @see setLanguage()
    * @var string
    * @access private
    *
    */
    var $_languageCode = null;

    /**
    * Holds any additional Aspell options that aren't supported by this
    * class with a setter and/or getter method.
    *
    * Format is array('<option>' => '<value>').  An entry with an integer key is
    * treated as a complete option on its own (e.g. array('--sug-mode=fast')).
    *
    * @see __construct()
    * @see _buildCommand()
    * @var array
    * @access private
    *
    */
    var $_aspellOptions = null;

    /**
    * This holds the path to the dictionary file we should use
    *
    * Passed to aspell with -d when it is not empty.
    *
    * @see setPersonalDictionary()
    * @see _buildCommand()
    * @var string
    * @access private
    *
    */
    var $_pathToPersonalDictionary = null;

    /**
    * This holds the path to the personal wordlist file we should use
    *
    * Passed to aspell with -p when it is not empty.
    *
    * @see setPersonalWordList()
    * @see _buildCommand()
    * @var string
    * @access private
    *
    */
    var $_pathToPersonalWordList = false;

    /**
    * Sets the minimum size of a legitimate word
    *
    * Setting this allows applications to specify how big a word has
    * to be before it is really considered a real word.  By default we set this
    * to 3, but you can override this
    *
    * @see setMinimumWordlimit()
    * @see _buildCommand()
    * @var int
    * @access private
    *
    */
    var $_skipLength = null;

    /**
    * Name of temp dir
    *
    * This class puts the text to spell into a temporary file before checking the
    * spelling.  This attribute simply gives us a directory where we can create these
    * temporary files (e.g. /tmp)
    *
    * @see _createTempFile()
    * @var string
    * @access private
    *
    */
    var $_tempDir = null;

    /**
    * Name of the temporary file currently holding the text to check
    *
    * Set by _createTempFile() and cleared again by checkWords() once the file has
    * been removed.
    *
    * @see _createTempFile()
    * @see checkWords()
    * @var string
    * @access private
    *
    */
    var $_tempFile = null;

    /**
    * Array of the words that were misspelled
    *
    * Each misspelled word appears once.  The values in this array are used to
    * index into the $_suggestions variable below.
    * Array format is $myArray[<key>] = '<misspelled word>'
    *
    * @see _parseReturn()
    * @var array
    * @access private
    *
    */
    var $_misspellings = null;

    /**
    * Holds suggestions for misspelled words
    *
    * This array holds the suggested spellings for all misspelled words.  The format of this
    * array is $myArray[<misspelled word>] = '<suggestion1>,<suggestion2>'
    *
    * @see _parseReturn()
    * @var array
    * @access private
    *
    */
    var $_suggestions = null;

    /**
    * Holds the original text we were to spellcheck against
    *
    * This simply holds the original text to spellcheck against.  It
    * never gets modified
    *
    * @see checkWords()
    * @var string
    * @access private
    *
    */
    var $_origText = null;

    /**
    * Formatting to use to begin highlighting a misspelled word
    *
    * By default we use a span tag whose title attribute carries the suggestions,
    * so they show up when hovering over the misspelled word.  The special tag
    * {suggested} is replaced with the comma separated suggestion list.
    *
    * @see __construct()
    * @see highlightBadWords()
    * @see setHighlight()
    * @var string
    * @access private
    *
    */
    var $_beginHighlight = null;

    /**
    * Formatting to use to end highlighting a misspelled word
    *
    * This is similar to $_beginHighlight above.  This simply ends the highlight
    *
    * @see __construct()
    * @see highlightBadWords()
    * @see setHighlight()
    * @var string
    * @access private
    *
    */
    var $_endHighlight = null;

    /**
    * Constructor
    *
    * Detects the operating system, stores any extra aspell options and applies the
    * defaults for language, minimum word length and highlight markup.  When setting
    * aspell options, be sure to use the option names as shown by calling aspell with
    * no options from your unix or dos shell.  Note that some of the aspell options
    * are also driven by methods of this class:
    * - setMinimumWordlimit(), which is explicitly called here and is sent to aspell
    *   as --ignore after your own options
    * - setPersonalWordList(), this isn't called explicitly so you can use the
    *   -p,--personal options or call that function instead.
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access public
    * @param array $aspellOptions Options for aspell in format myArray['aspellOption'] => 'aspellValue'
    *
    */
    function __construct($aspellOptions = '')
    {
        // PHP_OS may be spelled "WINNT", "WIN32" or "Windows", so compare
        // case-insensitively.
        if (strtoupper(substr(PHP_OS, 0, 3)) == 'WIN') {
            $this->_operatingSystem = SPELL_OS_WINDOWS;
        } else {
            $this->_operatingSystem = SPELL_OS_POSIX;
        }

        // Anything that is not an array (including the '' default) means
        // "no extra options".
        if (is_array($aspellOptions)) {
            $this->_aspellOptions = $aspellOptions;
        } else {
            $this->_aspellOptions = array();
        }

        $this->_misspellings = array();
        $this->_suggestions = array();
        $this->setLanguage();
        $this->setMinimumWordlimit();
        $this->setHighlight('<span title="{suggested}" style="background-color: #fdd905";>','</span>');
    }

    /**
    * Legacy (PHP 4 style) constructor
    *
    * Kept so that code calling $this->PHPSpellChecker(...) from a subclass keeps
    * working.  It simply forwards to __construct(), which is what newer PHP
    * versions invoke on "new".
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access public
    * @param array $aspellOptions Options for aspell in format myArray['aspellOption'] => 'aspellValue'
    *
    */
    function PHPSpellChecker($aspellOptions = '')
    {
        $this->__construct($aspellOptions);
    }

    /**
    * Sets the language that should be used.
    *
    * Allows applications to set the language to spellcheck against.  We
    * assume US English by default
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access public
    * @param string $languageCode Code of language to use
    *
    */
    function setLanguage($languageCode = 'en_US')
    {
        $this->_languageCode = $languageCode;
    }

    /**
    * Sets the temporary directory for Aspell to use
    *
    * This class pipes the text to check out of a temporary file.  This
    * method will set the location where we put those temporary files.
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access public
    * @param string $absolutePath Path to the temporary directory
    *
    */
    function setTempDir($absolutePath = '')
    {
        $this->_tempDir = $absolutePath;
    }

    /**
    * Sets the location to the Aspell binary
    *
    * This sets the path to the actual aspell binary.  Note that
    * this must be an absolute path, don't depend on your
    * operating system's path variables.
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access public
    * @param string $absoluteFileName Location of aspell binary
    *
    */
    function setPathToAspellBinary($absoluteFileName)
    {
        $this->_pathToAspellBinary = $absoluteFileName;
    }

    /**
    * Misspelled alias of setPathToAspellBinary()
    *
    * This was the only name the method originally had; it is kept so existing
    * callers do not break.
    *
    * @access public
    * @param string $absoluteFileName Location of aspell binary
    * @see setPathToAspellBinary()
    *
    */
    function setPathToApsellBinary($absoluteFileName)
    {
        $this->setPathToAspellBinary($absoluteFileName);
    }

    /**
    * Sets the minimum size of what a 'word' is defined as.
    *
    * Aspell can be configured to ignore words below a certain
    * length and this is facilitated in this class using this
    * method.  For example if you set the minimum size to 3
    * then any word less than 3 characters long is ignored.
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access public
    * @param int $minWordLength Minimum length of a word
    *
    */
    function setMinimumWordlimit($minWordLength = 3)
    {
        $this->_skipLength = $minWordLength;
    }

    /**
    * Sets the location of the personal dictionary to use
    *
    * Aspell allows for any number of dictionary files.  This can
    * be handy when an application might want each of its users to
    * have their own dictionary
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access public
    * @param string $absoluteFileName Exact location of dictionary to use
    *
    */
    function setPersonalDictionary($absoluteFileName)
    {
        $this->_pathToPersonalDictionary = $absoluteFileName;
    }

    /**
    * Sets the file where the current user's personal word list can be found.
    *
    * Files should be of the .pws extension.  This feature is optional and disabled
    * by default.  This method only records the path; it does not create or check
    * the file.
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access public
    * @param string $absoluteFileName The fully qualified location of the .pws file to use
    *
    */
    function setPersonalWordList($absoluteFileName)
    {
        $this->_pathToPersonalWordList = $absoluteFileName;
    }

    /**
    * Creates a temp file for use during the spellcheck process
    *
    * Text that is being spellchecked gets loaded into a temporary file and
    * then it gets piped over to aspell for checking.  On success the file name
    * is left in $_tempFile; on any failure no file is left behind.
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access private
    * @return boolean True if successful otherwise false
    *
    */
    function _createTempFile()
    {
        $this->_tempFile = null;

        // Check the text first so we never create a file we will not use.
        if ((string) $this->_origText === '') {
            trigger_error('No text has been set.');

            return false;
        }

        $tempFile = tempnam((string) $this->_tempDir, 'textToCheck');
        if ($tempFile === false) {
            trigger_error('Unable to write the temporary file.');

            return false;
        }

        $fd = fopen($tempFile, 'w');
        if (!$fd) {
            // tempnam() already created the file, so remove it again.
            unlink($tempFile);
            trigger_error('Unable to open the temporary file.');

            return false;
        }

        // "!" asks aspell's pipe mode to report only misspelled words
        // (see the aspell manual, ispell -a compatibility mode).
        fwrite($fd, "!\n");

        // Every line is prefixed with "^" so that aspell spellchecks it even
        // when its first character would otherwise be read as a command.
        // The line itself is written verbatim.
        foreach (explode("\n", $this->_origText) as $textLine) {
            fwrite($fd, '^' . $textLine . "\n");
        }

        fclose($fd);
        $this->_tempFile = $tempFile;

        return true;
    }

    /**
    * Adds a word to a user's personal dictionary.
    *
    * NOTE: this is not implemented yet.  Nothing is written anywhere and
    * the method always reports failure.
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access public
    * @param string $word Word to add to personal dictionary
    * @return boolean Always false
    *
    */
    function addWord($word)
    {
        return false;
    }

    /**
    * Parse the return output from aspell
    *
    * Only two kinds of lines are of interest:
    * - "& <word> <count> <offset>: <s1>, <s2>, ..." a misspelling with suggestions
    * - "# <word> <offset>" a misspelling without suggestions
    * Everything else (version banner, blank separators, error text) is skipped.
    * The offsets are ignored because highlighting is done by word, not by position.
    * Results are added to $_misspellings and $_suggestions.
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access private
    * @param string $aspellReturn Return output from aspell
    *
    */
    function _parseReturn($aspellReturn)
    {
        foreach (explode("\n", (string) $aspellReturn) as $line) {
            // Output may carry CRLF line ends.
            $line = rtrim($line, "\r");
            $marker = substr($line, 0, 1);
            if ($marker != '&' && $marker != '#') {
                continue;
            }

            $fields = explode(' ', $line);
            if (!isset($fields[1]) || $fields[1] === '') {
                continue;
            }
            $word = $fields[1];

            // Start from an empty list for every line so suggestions of a
            // previous word can never leak into this one.
            $suggestions = array();
            if ($marker == '&') {
                $colon = strpos($line, ':');
                if ($colon !== false) {
                    $list = substr($line, $colon + 2);
                    if ($list !== false && $list !== '') {
                        $suggestions = explode(', ', $list);
                    }
                }
            }

            if (!in_array($word, $this->_misspellings, true)) {
                $this->_misspellings[] = $word;
            }

            if (count($suggestions) > 0) {
                $this->_suggestions[$word] = implode(',', $suggestions);
            } else {
                // Should not make this string english dependent. For future we should
                // make this string configurable so it can be overridden by calling
                // applications language preference.
                $this->_suggestions[$word] = 'No suggestions';
            }
        }
    }

    /**
    * Builds the shell command used to run aspell against the temp file
    *
    * Every value that is interpolated as an argument is passed through
    * escapeshellarg().  The binary path itself is inserted as configured.
    * NOTE(review): it is left unquoted because the property documentation asks
    * Windows users for a short (8.3 style) path - confirm before quoting it.
    *
    * @access private
    * @param boolean $isHTMLEmbedded Indicates if the text to check has HTML in it
    * @return string The command line to hand to shell_exec()
    *
    */
    function _buildCommand($isHTMLEmbedded = false)
    {
        $addOptions = '';

        if ($isHTMLEmbedded == true) {
            // NOTE, it appears this doesn't work so well with HREF's (or at all).  Most
            // other tags seem ok
            $addOptions .= ' -H';
        }
        if (!empty($this->_pathToPersonalDictionary)) {
            $addOptions .= ' -d ' . escapeshellarg($this->_pathToPersonalDictionary);
        }
        if (!empty($this->_pathToPersonalWordList)) {
            $addOptions .= ' -p ' . escapeshellarg($this->_pathToPersonalWordList);
        }

        // Grab any other aspell options
        if (is_array($this->_aspellOptions)) {
            foreach ($this->_aspellOptions as $option => $value) {
                if (is_int($option)) {
                    // List style entry: the value is the complete option.
                    $option = $value;
                    $value = '';
                }
                $option = (string) $option;
                $value = (string) $value;
                if ($option === '') {
                    continue;
                }
                $addOptions .= ' ' . escapeshellarg($option);
                if ($value !== '') {
                    $addOptions .= ' ' . escapeshellarg($value);
                }
            }
        }

        // A minimum word length of N means words of up to N-1 characters are
        // skipped.  Added after the caller's options so this setting is the
        // last one given (presumably the one aspell honours - TODO confirm).
        $minLength = (int) $this->_skipLength;
        if ($minLength > 1) {
            $addOptions .= ' --ignore=' . ($minLength - 1);
        }

        $command = sprintf(
            '%s -a --lang=%s%s < %s',
            $this->_pathToAspellBinary,
            escapeshellarg((string) $this->_languageCode),
            $addOptions,
            escapeshellarg((string) $this->_tempFile)
        );

        // Calling aspell differs slightly by OS: on windows error output is
        // folded into the returned text.
        if ($this->_operatingSystem == SPELL_OS_WINDOWS) {
            $command .= ' 2>&1';
        }

        return $command;
    }

    /**
    * Spell checks a piece of text.
    *
    * The text is written to a temporary file, piped through aspell and the
    * output is parsed into $_misspellings and $_suggestions.  Results of any
    * earlier call are discarded first.  Problems are reported via trigger_error().
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access public
    * @param string $textString
    * @param boolean $isHTMLEmbedded Indicates if the text to check has HTML in it
    * @return string|null $textString with misspelled words highlighted, or null
    *                     when nothing was misspelled or the check could not be run
    *
    */
    function checkWords($textString, $isHTMLEmbedded = false)
    {
        $this->_origText = $textString;

        // Start clean so words from a previous call are not highlighted again.
        $this->_misspellings = array();
        $this->_suggestions = array();

        if (empty($this->_pathToAspellBinary)) {
            trigger_error('The path to the Aspell binary has not been set.');

            return null;
        }

        if ($this->_createTempFile() == false) {
            trigger_error('Unable to create the temporary file.');

            return null;
        }

        // Execute the command.
        $retValue = shell_exec($this->_buildCommand($isHTMLEmbedded));

        // Delete the temporary file created.
        if (unlink($this->_tempFile) == false) {
            trigger_error('Unable to delete temporary file.');
        }
        $this->_tempFile = null;

        // Check to see if the execute statement returned anything.
        if (empty($retValue)) {
            trigger_error('There was a problem in executing the command against the Aspell binary.');

            return null;
        }

        $this->_parseReturn($retValue);

        if (count($this->_misspellings) > 0) {
            return $this->highlightBadWords($isHTMLEmbedded);
        }

        return null;
    }

    /**
    * Allows ability to set a custom method for highlight misspelled words
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access public
    * @param string $beginHighlight Opening format of a highlighted word, may contain {suggested}
    * @param string $endHighlight Closing format of a highlighted word
    *
    */
    function setHighlight($beginHighlight, $endHighlight)
    {
        $this->_beginHighlight = $beginHighlight;
        $this->_endHighlight = $endHighlight;
    }

    /**
    * Highlights misspelled words and returns the text
    *
    * Works on a copy of the original text.  Words are matched case-insensitively,
    * so each word is processed only once regardless of case; otherwise a second
    * pass would wrap the same occurrences again.
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access public
    * @param boolean $isHTMLEmbedded Indicates if the text has HTML in it
    * @return string original search text with misspellings highlighted.
    *
    */
    function highlightBadWords($isHTMLEmbedded = false)
    {
        $tmpString = $this->_origText;
        if (!is_array($this->_misspellings)) {
            return $tmpString;
        }

        $handled = array();
        foreach ($this->_misspellings as $curWord) {
            $lookup = strtolower($curWord);
            if (isset($handled[$lookup])) {
                continue;
            }
            $handled[$lookup] = true;

            if (isset($this->_suggestions[$curWord])) {
                $suggested = $this->_suggestions[$curWord];
            } else {
                $suggested = 'No suggestions';
            }
            $tmpBeginHighlight = str_replace('{suggested}', $suggested, $this->_beginHighlight);

            if ($isHTMLEmbedded == true) {
                $tmpString = $this->_getHighlightHTML($tmpBeginHighlight, $curWord, $tmpString);
            } else {
                $tmpString = $this->_getHighlight($tmpBeginHighlight, $curWord, $tmpString);
            }
        }

        return $tmpString;
    }

    /**
    * Escapes text so preg_replace() inserts it literally
    *
    * In a replacement string a backslash or dollar sign can start a back
    * reference, so both are escaped.
    *
    * @access private
    * @param string $text Text to be used inside a replacement string
    * @return string Escaped text
    *
    */
    function _escapeReplacement($text)
    {
        return str_replace(array('\\', '$'), array('\\\\', '\\$'), (string) $text);
    }

    /**
    * Wraps every whole-word occurrence of a word in the highlight markup
    *
    * Matching is case-insensitive and the matched text keeps its own case.  A
    * match must not be directly preceded or followed by a word character.
    *
    * @access private
    * @param string $beginHighlight Opening markup; $_endHighlight closes it
    * @param string $needle Word to highlight
    * @param string $text Text without markup to search
    * @return string Text with the word highlighted (unchanged if the match fails)
    *
    */
    function _highlightSegment($beginHighlight, $needle, $text)
    {
        if ((string) $needle === '' || (string) $text === '') {
            return $text;
        }

        $pattern = '/(?<!\w)(' . preg_quote($needle, '/') . ')(?!\w)/i';

        // ${1} rather than \1 so a closing markup that starts with a digit
        // cannot be read as part of the back reference number.
        $replacement = $this->_escapeReplacement($beginHighlight)
                     . '${1}'
                     . $this->_escapeReplacement($this->_endHighlight);

        $result = preg_replace($pattern, $replacement, $text);

        return ($result === null) ? $text : $result;
    }

    /**
    * Highlights text
    *
    * This function highlights text in a given string.  This is not meant
    * for strings with embedded HTML
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access private
    * @param string $beginHighlight How to begin highlight text. NOTE that the _endHighlight property
    * is used to close this one.
    * @param string $needle Text to highlight
    * @param string $haystack Text to search
    * @return string Text with the word highlighted
    * @see _getHighlightHTML()
    *
    */
    function _getHighlight($beginHighlight, $needle, $haystack)
    {
        return $this->_highlightSegment($beginHighlight, $needle, $haystack);
    }

    /**
    * Highlights some text that is embedded within HTML
    *
    * The text is split into tags (anything from "<" to the next ">") and the
    * text between them.  Only the text between tags is searched, so a word that
    * also appears inside a tag, e.g. in an attribute value, is left untouched.
    *
    * @author Tony Bibbs <tony at geeklog.net>
    * @access private
    * @param string $beginHighlight How to begin highlight text. NOTE that the _endHighlight property
    * is used to close this one.
    * @param string $needle Text to highlight
    * @param string $haystack Text to search
    * @return string HTML with the word highlighted
    * @see _getHighlight()
    *
    */
    function _getHighlightHTML($beginHighlight, $needle, $haystack)
    {
        // With the delimiter captured the result alternates between text
        // (even index) and tag (odd index).
        $segments = preg_split('/(<[^>]*>)/', (string) $haystack, -1, PREG_SPLIT_DELIM_CAPTURE);
        if (!is_array($segments)) {
            return $haystack;
        }

        $total = count($segments);
        for ($i = 0; $i < $total; $i += 2) {
            $segments[$i] = $this->_highlightSegment($beginHighlight, $needle, $segments[$i]);
        }

        return implode('', $segments);
    }
}

?>




More information about the geeklog-cvs mailing list