#!/usr/bin/perl # Written by Jon Dehdari 2004-2005 # Perl 5.8 # Perstem: Stemmer and Morphological Parser for Persian # The license is the GPL v.2 (www.fsf.org) use strict; use Getopt::Long; #use diagnostics; my $version = "0.8.0"; my $date = "2005/09/21"; my $copyright = "(c) 2004-2005 Jon Dehdari - GPL v2"; my $title = "Perstem: Persian stemmer $version, $date - $copyright"; my $dont_stem = 0; my $input_type = 0; my $output_type = 0; my $no_roman = 0; my $recall = 0; my $show_links = 0; my $show_only_root = 0; my $tokenize = 0; my $unvowel = 0; my $zwnj = 0; my $ar_chars = "EqHSTDZLVU"; #my $al = "AbptVjcHxdLrzJsCSDTZEGfqkglmnuhiaoe\x5d\x7cPkMXIUN~"; #my $longvowel = "Aui]"; #my @line; #my %resolve; #my @resolve; #my $resolve; #my $resolve_file = "resolve.txt"; my $usage = <<"END_OF_USAGE"; ${title} Syntax: perl $0 [options] < input > output Function: Stemmer and morphological analyzer for the Persian language (Farsi). Inflexional morphemes are separated from their roots. Options: -d, --nostem Don't stem; mostly for character-set conversion -h, --help Print usage -i, --input Input character encoding type {cp1256,isiri3342,utf8,unihtml} -l, --links Show morphological links -n, --noroman Delete all non-Arabic script characters (eg. HTML tags) -o, --output Output character encoding type {arabtex,cp1256,isiri3342,utf8,unihtml} -r, --recall Increase recall by parsing ambiguous affixes -t, --tokenize Tokenize punctuation -u, --unvowel Remove short vowels -v, --version Print version ($version) -w, --root Return only word roots -z, --zwnj Insert Zero Width Non-Joiners where they should be END_OF_USAGE # -s, --stoplist Use stopword list (default: ./resolve.txt) GetOptions( 'd|nostem' => \$dont_stem, 'h|help|?' => sub { print $usage; exit; }, 'i|input:s' => \$input_type, 'l|links' => \$show_links, 'n|noroman' => \$no_roman, 'o|output:s' => \$output_type, 'r|recall' => \$recall, # 's|stoplist:s' => \$resolve_file, 't|tokenize' => \$tokenize, 'u|unvowel' => \$unvowel, 'v|version' => sub { print "$version\n"; exit; }, 'w|root' => \$show_only_root, 'z|zwnj' => \$zwnj, ) or die $usage; $input_type =~ s/.*1256/cp1256/; # equates win1256 with cp1256 $output_type =~ s/.*1256/cp1256/; # equates win1256 with cp1256 $input_type =~ tr/[A-Z]/[a-z]/; # recognizes more enctype spellings $output_type =~ tr/[A-Z]/[a-z]/; # recognizes more enctype spellings $input_type =~ tr/-//; # eg. UTF-8 & utf8 $output_type =~ tr/-//; # eg. UTF-8 & utf8 ### Ignore the following 11 lines ## The format of the resolve.txt file is as follows: ## 1. Mokassar: 'ktb ktAb' OR 'ktb ktAb_+PL' ## 2. Preparsed (speed): 'krdn kr_+dn' ## 3. Don't stem: 'bArAn bArAn' ## 4. Stop word: 'u ' #open RESOLVE, "$resolve_file"; #while ($resolve = ) { # chomp $resolve; # @resolve = split /\t/, $resolve; # %resolve = ( %resolve, "$resolve[0]" => "$resolve[1]" , ); #} ### A hack for what Perl should've already done: support BOTH utf8 & other input types at runtime if ($input_type eq "utf8") { # UTF-8 use encoding "utf8"; open STDIN, "<:encoding(UTF-8)" ; } else { unimport encoding "utf8";} while ($_ = <> ) { chomp $_; $_ =~ tr/\r/\n/d; # Deletes stupid DOS carriage returns ### Converts from native script to romanized if ($input_type) { if ($no_roman) { $_ =~ s/
/\n/g; $_ =~ s/

/\n/g; $_ =~ tr/\x01-\x09\x1b-\x1f\x21-\x2d\x2f-\x5a\x5c\x5e-\x9f//d; # Deletes all chars below xa0 except: 0a,20,2e,5b,5d # $_ =~ s/<\.>//g; # Deletes all dots in HTML tags # $_ =~ s/<.*?>//g; # Deletes all HTML tags on 1 line # $_ =~ s/<.*?//g; # Deleses 1st part of line-spanning HTML tags # $_ =~ s/.*?>//g; # Deletes 2nd part of line-spanning HTML tags } if ($input_type eq "utf8") { $_ =~ tr/ابپتثجچحخدذرزژسشصضطظعغفقكگلمنوهيَُِآ☿ةکیءىۀئؤًّ،؛؟٪‍‌/AbptVjcHxdLrzJsCSDTZEGfqkglmnuhiaoe\x5d\x7cPkiMiXIUN~,;?%*\-/; } elsif ($input_type eq "unihtml") { my %unihtml2roman = ( 'ا' => 'A', '☿' => '|', "ب" => 'b', 'ة' => 'P', 'پ' => 'p', 'ت' => 't', 'ث' => 'V', 'ج' => 'j', 'چ' => 'c', 'ح' => 'H', 'خ' => 'x', 'د' => 'd', 'ذ' => 'L', 'ر' => 'r', 'ز' => 'z', 'ژ' => 'J', 'س' => 's', 'ش' => 'C', 'ص' => 'S', 'ض' => 'D', 'ط' => 'T', 'ظ' => 'Z', 'ع' => 'E', 'غ' => 'G', 'ف' => 'f', 'ق' => 'q', 'ك' => 'k', 'ک' => 'k', 'گ' => 'g', 'ل' => 'l', 'م' => 'm', 'ن' => 'n', 'و' => 'u', 'ه' => 'h', 'ي' => 'i', 'ی' => 'i', 'ى' => 'A', 'َ' => 'a', 'ُ' => 'o', 'ِ' => 'e', 'ّ' => '~', 'آ' => ']', 'ء' => 'M', 'ً' => 'N', 'أ' => '|', 'ؤ' => 'U', 'إ' => '|', 'ئ' => 'I', 'ۀ' => 'X', '٪' => '%', '،' => ',', '؛' => ';', '؟' => '?', '‌' => "-", ' ' => ' ', '.' => '.', ':' => ':', ); my @charx = split(/(?=\&\#)|(?=\s)|(?=\n)/, $_); $_ = ""; foreach my $charx (@charx) { my $text_from_new = $unihtml2roman{$charx}; $_ = $_ . $text_from_new; } # Ends foreach } # Ends elsif ($input_type eq "unihtml") elsif ($input_type eq "cp1256") { $_ =~ tr/\xc7\xc8\x81\xca\xcb\xcc\x8d\xcd\xce\xcf\xd0\xd1\xd2\x8e\xd3\xd4\xd5\xd6\xd8\xd9\xda\xdb\xdd\xde\xdf\x90\xe1\xe3\xe4\xe6\xe5\xed\xf3\xf5\xf6\xc2\xff\xc9\x98\xc1\xc0\xc6\xc4\xf0\xf8\xa1\xba\xbf\xab\xbb\x9d\xec/AbptVjcHxdLrzJsCSDTZEGfqkglmnuhiaoe\x5d\x7cPkMXIUN~,;?{}\-i/; } elsif ($input_type eq "isiri3342") { $_ =~ tr/\xc1\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xfe\xf0\xf2\xf1\xc0\xc1\xfc\xda\xe1\xc2\xfb\xfa\xf3\xf6\xac\xbb\xbf\xa5\xe7\xe6\xa1/AbptVjcHxdLrzJsCSDTZEGfqKglmnuhyaoe\x5d\x7cPkiMIUN~,;?%{}\-/; } $_ =~ s/\bA/|/g; # eg. AirAn -> |irAn } # if ($input_type) if ( $unvowel ) { $_ =~ s/\b([aeo])/|/g; # Inserts alef before words that begin with short vowel $_ =~ s/\bA/]/g; # Changes long 'aa' at beginning of word to alef madda $_ =~ s/[aeo~]//g; # Finally, removes all other short vowels and tashdids } if ( $zwnj ) { #Inserts ZWNJ's where they should have been originally, but weren't $_ =~ s/(? 'ا', '|' => 'ا', 'b' => 'ب', 'p' => 'پ', 't' => 'ت', 'V' => 'ث', 'j' => 'ج', 'c' => 'چ', 'H' => 'ح', 'x' => 'خ', 'd' => 'د', 'L' => 'ذ', 'r' => 'ر', 'z' => 'ز', 'J' => 'ژ', 's' => 'س', 'C' => 'ش', 'S' => 'ص', 'D' => 'ض', 'T' => 'ط', 'Z' => 'ظ', 'E' => 'ع', 'G' => 'غ', 'f' => 'ف', 'q' => 'ق', 'k' => 'ک', 'K' => 'ك', 'g' => 'گ', 'l' => 'ل', 'm' => 'م', 'n' => 'ن', 'u' => 'و', 'v' => 'و', 'w' => 'و', 'h' => 'ه', 'X' => 'ۀ', 'i' => 'ی', 'I' => 'ئ', 'a' => 'َ', 'o' => 'ُ', 'e' => 'ِ', '~' => 'ّ', ',' => '،', ';' => '؛', '?' => '؟', ']' => 'آ', 'M' => 'ء', 'N' => 'ً', 'U' => 'ؤ', '-' => '‌', ' ' => ' ', '_' => '_', '+' => '+', "\n" => '
', '.' => '‫.‪', ); my @charx = split(//, $_); $_ = ""; foreach my $charx (@charx) { my $newchar = $roman2unihtml{$charx}; $_ = $_ . $newchar; } # Ends foreach } # Ends elsif (unihtml) elsif ($output_type eq "cp1256") { $_ =~ tr/AbptVjcHxdLrzJsCSDTZEGfqKglmnuhyaoe\x5d\x7cPkMXIUN~,;?{}\-i/\xc7\xc8\x81\xca\xcb\xcc\x8d\xcd\xce\xcf\xd0\xd1\xd2\x8e\xd3\xd4\xd5\xd6\xd8\xd9\xda\xdb\xdd\xde\xdf\x90\xe1\xe3\xe4\xe6\xe5\xed\xf3\xf5\xf6\xc2\xff\xc9\x98\xc1\xc0\xc6\xc4\xf0\xf8\xa1\xba\xbf\xab\xbb\x9d\xec/; # $_ =~ s/\x2e/\xfe\x2e\xfd/g; # Corrects periods to be RTL embedded; broken } elsif ($output_type eq "isiri3342") { $_ =~ tr/AbptVjcHxdLrzJsCSDTZEGfqKglmnuhyaoe\x5d\x7cPkiMIUN~,;?%{}\-/\xc1\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xfe\xf0\xf2\xf1\xc0\xc1\xfc\xda\xe1\xc2\xfb\xfa\xf3\xf6\xac\xbb\xbf\xa5\xe7\xe6\xa1/; } elsif ($output_type eq "arabtex") { my %roman2arabtex = ( 'A' => 'A', '|' => 'a', 'b' => 'b', 'p' => 'p', 't' => 't', 'V' => '_t', 'j' => 'j', 'c' => '^c', 'H' => '.h', 'x' => 'x', 'd' => 'd', 'L' => '_d', 'r' => 'r', 'z' => 'z', 'J' => '^z', 's' => 's', 'C' => '^s', 'S' => '.s', 'D' => '.d', 'T' => '.t', 'Z' => '.z', 'E' => '`', 'G' => '.g', 'f' => 'f', 'q' => 'q', 'K' => 'k', 'k' => 'k', 'g' => 'g', 'l' => 'l', 'm' => 'm', 'n' => 'n', 'u' => 'U', 'v' => 'w', 'w' => 'w', 'h' => 'h', 'X' => 'H-i', 'i' => 'I', 'I' => '\'y', 'a' => 'a', 'o' => 'o', 'e' => 'e', 'P' => 'T', '~' => '', ',' => ',', ';' => ';', '?' => '?', ']' => '^A', 'M' => '\'', 'N' => 'aN', 'U' => 'U\'', '{' => '\lq ', '}' => '\rq ', '-' => '\hspace{0ex}', '.' => '.', ' ' => ' ', '_' => '_', '+' => '+', ); my @charx = split(//, $_); $_ = ""; foreach my $charx (@charx) { my $newchar = $roman2arabtex{$charx}; $_ = $_ . $newchar; } # Ends foreach # $_ = $_ . '\\\\'; # Appends LaTeX newline '\\' after each line } # Ends elsif (arabtex) if ($output_type eq "utf8" && $_ =~ m/[^ .\n]/) # If utf8 & non-empty { binmode(STDOUT, ":utf8"); # Uses the :utf8 output layer print "$_\n"; } elsif ($_ =~ m/[^ .\n]/) {print "$_\n";} # If arabic-script line is non-empty } # if ($output_type) else { if ($_ =~ m/[^ .\n]/) {print "$_\n";} # If roman-script line is non-empty } #} # ends else } # ends while (<>)