<?php
// Route every PHP diagnostic to php_error_handler (), which aborts the script.
set_error_handler ('php_error_handler');

// -1 turns on all error levels on every PHP version. The former
// "E_ALL | E_STRICT" touches the E_STRICT constant, which is deprecated in
// recent PHP releases; with the aborting handler installed above, that
// deprecation notice alone would stop the script.
error_reporting (-1);
ini_set ('display_errors', '1');

date_default_timezone_set ('Europe/Moscow');

/* * */

/**
 * Counts how often words and two-word phrases occur in the text files found
 * below a directory tree, then prints both rankings ("hit parades").
 *
 * File contents are converted to the single-byte Windows-1251 code page
 * (charset code 'w') before tokenising, so the byte-oriented string functions
 * used here (strlen, strtr) act on whole characters.
 */
class WordScanner extends AbstractScanner {
    /** Root directory of the scan, with forward slashes. */
    private $path;
    /** First file-name mask handed to _scanTree (). */
    private $ext_1;
    /** Second file-name mask handed to _scanTree (). */
    private $ext_2;
    /** Charset code of the files on disk (convert_cyr_string letter codes). */
    private $source_charset;
    /** Charset code the text is converted to after case folding. */
    private $target_charset;
    /** Words whose byte length is less than OR EQUAL to this value are ignored. */
    private $min_word_length;
    /** When falsy, lower-case Latin letters are blanked out of the text. */
    private $keep_latin_characters;
    /** When falsy, tokens that look like numbers are dropped. */
    private $keep_numbers;
    /** Accumulated counters: 'words' => word => hits, 'phrases' => phrase => hits. */
    private $collected;

    /**
     * @param string $dirname A directory; the scan starts two levels above it.
     */
    public function __construct ($dirname) {
        $this->path = strtr (dirname (dirname ($dirname)), '\\', '/');

        $this->ext_1                 = '.ipf';
        $this->ext_2                 = '.tx_';

        $this->source_charset        = 'd';
        $this->target_charset        = 'w';

        $this->min_word_length       = 3;
        $this->keep_latin_characters = 0;
        $this->keep_numbers          = 0;

        $this->collected = array (
                                   'words'   => array (),
                                   'phrases' => array ()
                                 );

        parent::__construct ();
    }

    /* * */

    /**
     * Scans the tree, gathers the statistics and prints the report.
     *
     * Output starts with an HTML preamble that opens a <textarea>. When the
     * scan finds no files, 'No files.' is printed and the script is
     * terminated with exit.
     */
    public function getWordStatistics () {
        print "\n".'<html><body bgcolor=black><textarea style="color:orange;background:black">'."\n\n";

        print "\n".'-+-'."\n\n";

        $files = $this->_scanTree ($this->path, $this->ext_1, $this->ext_2);
        if (!count ($files)) { print 'No files.'; exit; }

        print "\n".'-+-'."\n\n";

        $this->_printReport ($this->_calcStatistics ($files));
    }

    /* * */

    /**
     * Reads every file, updates the word and phrase counters and builds the report.
     *
     * @param array $files List of file names to analyse.
     * @return array 'words' and 'phrases' entries, each shaped like the
     *               return value of _processWordSet ().
     */
    private function _calcStatistics ($files) {
        foreach ($files as $file_name) {
            $words = $this->_extractWords ($this->_readText ($file_name));

            foreach ($words as $word) {
                if (!isset ($this->collected['words'][$word])) $this->collected['words'][$word] = 0;
                $this->collected['words'][$word] ++;
            }

            // A phrase is a pair of neighbouring words that survived filtering.
            $last = count ($words) - 1;

            for ($cntr = 0; $cntr < $last; $cntr ++) {
                 $phrase = $words[$cntr].' '.$words[$cntr + 1];

                 if (!isset ($this->collected['phrases'][$phrase])) $this->collected['phrases'][$phrase] = 0;
                 $this->collected['phrases'][$phrase] ++;
            }

            yield_time ();
        }

        return array (
                       'words'   => $this->_processWordSet ('words'),
                       'phrases' => $this->_processWordSet ('phrases')
                     );
    }

    /**
     * Splits normalised text on spaces and drops tokens that are too short or numeric.
     *
     * @param string $text Text as returned by _readText ().
     * @return array List of accepted words in their original order.
     */
    private function _extractWords ($text) {
        $words = array ();

        foreach (explode (' ', $text) as $word) {
            // Note the "<=": a word exactly min_word_length bytes long is skipped too.
            if (strlen ($word) <= $this->min_word_length) continue;

            if (!$this->keep_numbers) {
                // Anything whose integer prefix is non-zero counts as a number ...
                if ((int) $word) continue;
                // ... and so does a token that is nothing but zeros.
                if (!trim (strtr ($word, '0', ' '))) continue;
            }

            $words[] = $word;
        }

        return $words;
    }

    /**
     * Derives the ranking and the summary figures for one counter set.
     *
     * @param string $selector 'words' or 'phrases'.
     * @return array 'hit_parade' (ascending list of "<padded count>\t<item>"
     *               lines), 'max_value' (highest count), 'average_1' (rounded
     *               mean count) and 'average_2' (rounded mean of the counts
     *               that exceed average_1).
     */
    private function _processWordSet ($selector) {
        $set   = $this->collected[$selector];
        $total = count ($set);

        // max () rejects an empty array, so guard the no-data case.
        $max_frequency   = ($total > 0) ? max ($set) : 0;
        $avg_1_frequency = ($total > 0) ? round (array_sum ($set) / $total) : 0;

        /* * */

        $above_sum = 0; $above_cntr = 0;

        foreach ($set as $freq) {
            if ($freq > $avg_1_frequency) { $above_sum += $freq; $above_cntr ++; }
        }

        $avg_2_frequency = ($above_cntr > 0) ? round ($above_sum / $above_cntr) : 0;

        /* * */

        // Left-padding every count to the width of the largest one makes a
        // plain string sort order the lines numerically.
        $width = strlen ((string) $max_frequency);
        $hit_parade = array ();

        foreach ($set as $word=>$freq) {
            $hit_parade[] = $this->_addLeadingSpaces ($freq, $width)."\t".$word;
        }

        sort ($hit_parade, SORT_STRING);

        /* * */

        yield_time ();

        /* * */

        return array (
                      'hit_parade' => $hit_parade,
                      'max_value'  => $max_frequency,
                      'average_1'  => $avg_1_frequency,
                      'average_2'  => $avg_2_frequency
                     );
    }

    /**
     * Loads a file and normalises it into lower-case words separated by spaces.
     *
     * @param string $file_name File to read.
     * @return string Normalised text; an empty string for an empty file.
     * @throws RuntimeException When the file cannot be read or a charset
     *                          conversion is not available.
     */
    private function _readText ($file_name) {
        // file_get_contents () copes with zero-length files, unlike
        // fread () called with a length of 0.
        $text = file_get_contents ($file_name);

        if ($text === false) {
            throw new RuntimeException ('Cannot read file: '.$file_name);
        }

        if ($text === '') return '';

        /* * */

        // Case folding is done while the text is in Windows-1251.
        $text = $this->_convertCyrillic ($text, $this->source_charset, 'w');
        $text = $this->_toLowerCase ($text);
        $text = $this->_convertCyrillic ($text, 'w', $this->target_charset);

        /* * */

        // Punctuation and control characters become word separators.
        // "\x08" is the backspace character; PHP has no "\b" escape, so the
        // former "\b" literal never matched it.
        $separators = '~`!@#$%^&*()-_=+[]{};:"|,.<>/?'."'".'\\'."\x08\t\n\v\f\r";
        $text = strtr ($text, $separators, str_repeat (' ', strlen ($separators)));

        /* * */

        if ($this->keep_latin_characters) return $text;

        return strtr ($text, 'abcdefghijklmnopqrstuvwxyz', str_repeat (' ', 26));
    }

    /**
     * Lower-cases Latin letters and the Russian alphabet in Windows-1251 text.
     *
     * The table is built from byte values so that it does not depend on the
     * encoding this source file is saved in. Other Cyrillic capitals of the
     * code page (Ukrainian, Serbian, ...) are left untouched.
     *
     * @param string $text Windows-1251 encoded text.
     * @return string Case-folded text of the same byte length.
     */
    private function _toLowerCase ($text) {
        $upper = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
        $lower = 'abcdefghijklmnopqrstuvwxyz';

        // 0xC0-0xDF are the capital letters A..YA; their small forms sit 0x20 higher.
        for ($code = 0xC0; $code <= 0xDF; $code ++) {
             $upper .= chr ($code);
             $lower .= chr ($code + 0x20);
        }

        // Capital IO (0xA8) and small io (0xB8) live outside that range.
        $upper .= chr (0xA8);
        $lower .= chr (0xB8);

        return strtr ($text, $upper, $lower);
    }

    /**
     * Converts text between Cyrillic code pages named by convert_cyr_string letter codes.
     *
     * convert_cyr_string () is used where it is still a regular function
     * (before PHP 7.4); later versions deprecate and then remove it, so
     * mb_convert_encoding () takes over there.
     *
     * @param string $text Text to convert.
     * @param string $from Source code: k, w, i, a or d.
     * @param string $to   Target code: k, w, i, a or d.
     * @return string Converted text.
     * @throws RuntimeException When no conversion routine supports the pair.
     */
    private function _convertCyrillic ($text, $from, $to) {
        if (function_exists ('convert_cyr_string') && version_compare (PHP_VERSION, '7.4.0', '<')) {
            return convert_cyr_string ($text, $from, $to);
        }

        if ($from === $to) return $text;

        $names = array (
                         'k' => 'KOI8-R',
                         'w' => 'CP1251',
                         'i' => 'ISO-8859-5',
                         'a' => 'CP866',
                         'd' => 'CP866'
                       );

        if (!function_exists ('mb_convert_encoding') || !isset ($names[$from]) || !isset ($names[$to])) {
            throw new RuntimeException ('Cannot convert charset "'.$from.'" to "'.$to.'".');
        }

        return mb_convert_encoding ($text, $names[$to], $names[$from]);
    }

    /* * */

    /**
     * Prints both rankings, highest count first, with markers at the averages.
     *
     * A marker line is printed just before the first entry whose count is
     * below the corresponding average. If no entry is below it and the
     * average is at most 1, the marker is printed after the list instead.
     *
     * @param array $report Report as returned by _calcStatistics ().
     */
    private function _printReport ($report) {
        foreach (array ('words', 'phrases') as $step => $report_page) {
            $hit_parade = $report[$report_page]['hit_parade'];
            $average_1  = $report[$report_page]['average_1'];
            $average_2  = $report[$report_page]['average_2'];
            $width      = strlen ((string) $report[$report_page]['max_value']);

            $marker_1 = $this->_addLeadingSpaces ($average_1, $width)."\t".'--- [AVG.1] ---'."\n";
            $marker_2 = $this->_addLeadingSpaces ($average_2, $width)."\t".'--- [AVG.2] ---'."\n";

            $is_under_average_1 = false; $is_under_average_2 = false;

            // The list is sorted ascending, so walk it backwards.
            foreach (array_reverse ($hit_parade) as $string) {
                if (!$is_under_average_1 || !$is_under_average_2) {
                    $parts = explode ("\t", $string);
                    $number = (int) trim ($parts[0]);

                    if (!$is_under_average_1 && $number < $average_1) {
                        print $marker_1;
                        $is_under_average_1 = true;
                    }

                    if (!$is_under_average_2 && $number < $average_2) {
                        print $marker_2;
                        $is_under_average_2 = true;
                    }
                }

                print $string."\n";
            }

            if (!$is_under_average_1 && $average_1 <= 1) print $marker_1;
            if (!$is_under_average_2 && $average_2 <= 1) print $marker_2;

            print "\n"; if ($step == 0) { print '-+-'; print "\n\n"; }
        }
    }

    /* * */

    /**
     * Right-aligns a number in a field of the given width using spaces.
     *
     * @param int|float $number          Value to format.
     * @param int       $required_length Minimum width; longer values are returned unpadded.
     * @return string
     */
    private function _addLeadingSpaces ($number, $required_length) {
        return str_pad ((string) $number, $required_length, ' ', STR_PAD_LEFT);
    }
}

/**
 * Base class for scanners; provides a recursive directory walk.
 */
class AbstractScanner {
    public function __construct () {}

    /* * */

    /**
     * Recursively collects the files below $path whose full path contains one of the masks.
     *
     * Each sub-directory name is printed (followed by a newline) as it is
     * entered. Masks are matched as plain substrings of the whole path, not
     * only of the file name; empty masks are ignored.
     *
     * @param string $path   Directory to scan; backslashes are normalised to '/'.
     * @param string $mask_1 First substring to look for.
     * @param string $mask_2 Optional second substring.
     * @param string $mask_3 Optional third substring.
     * @return array List of matching file paths, in glob () order.
     */
    protected function _scanTree ($path, $mask_1, $mask_2 = '', $mask_3 = '')  {
        $result = array ();

        $path = strtr ($path, '\\', '/');

        // glob () returns false on error; treat that like an empty directory
        // rather than handing false to the loop below.
        $entries = glob ($path.'/*');
        if ($entries === false) return $result;

        $masks = array ($mask_1, $mask_2, $mask_3);

        foreach ($entries as $file_name) {
            if (is_dir ($file_name)) {
                print $file_name."\n";

                foreach ($this->_scanTree ($file_name, $mask_1, $mask_2, $mask_3) as $found) {
                    $result[] = $found;
                }

                continue;
            }

            foreach ($masks as $mask) {
                if ($mask && strpos ($file_name, $mask) !== false) {
                    $result[] = $file_name;
                    break; // one hit is enough, never list a file twice
                }
            }
        }

        return $result;
    }
}

// Entry point: a script that includes this file may preset $dirname;
// otherwise the directory of this file is used as the starting point.
if (isset ($dirname)) {
    $scanner = new WordScanner ($dirname);
} else {
    $scanner = new WordScanner (dirname (__FILE__));
}

$scanner->getWordStatistics ();
unset ($scanner);

/* * */

/**
 * str_replace () with the subject first: replaces every $needle in $haystack with $str.
 */
function strreplace ($haystack, $needle, $str) {
    $replaced = str_replace ($needle, $str, $haystack);

    return $replaced;
}
/**
 * Backs off a little between work units: asks proc_nice () for a priority
 * change of 31 when that function exists, otherwise sleeps 25 microseconds.
 */
function yield_time () {
    if (!function_exists ('proc_nice')) {
        usleep (25);
        return;
    }

    proc_nice (31);
}

/**
 * Error handler that prints the diagnostic and terminates the script.
 *
 * Output format: "[<errno>] <message> (<file with forward slashes>, <line>)".
 *
 * @param int    $errno   Error level.
 * @param string $errstr  Error message.
 * @param string $errfile File in which the error was raised.
 * @param int    $errline Line on which the error was raised.
 * @param mixed  $vars    Legacy error context. Optional, because current PHP
 *                        versions call the handler with four arguments only;
 *                        a required fifth parameter would turn every
 *                        handled error into an argument-count failure.
 */
function php_error_handler ($errno, $errstr, $errfile, $errline, $vars = null) {
    die ("\n\n".'['.$errno.'] '.$errstr.' ('.strtr ($errfile, '\\', '/').', '.$errline.')'."\n\n");
}