A spell Corrector in perl6 part 3
(See part 1 and part 2 for explanations of the following code).
Now, this code should work. If I copy and paste it into pugs it works as expected. If I run it it has some failures in the final tests for correct() that I can't explain.
The part in the beinning shows how to count different words in a file, it depends on proper handling of unicode in pugs, so it may or may not work at the moment,
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
sub words($file) { slurp($file).lc.comb(/<alpha>+/) }
sub train(@words) {
my %res;
for @words -> $w { %res{$w}++ }
%res
}
#my %NWORDS = train(words('/home/rff/Desktop/big.txt'));
my %NWORDS={'ciao'=>4,'c'=>3,'cibo'=>1,'ciaao'=>1,'ccc'=>1,'cia'=>1};
my @ALPHA = 'a'..'z';
# 'abc' -> 'ac'
sub deletion($word) {
(^$word.chars).map: {substr(my $tmp = $word,$_,1)='';$tmp};
}
# 'abc' -> 'adc'
sub substitution($word) {
gather {
for (0..$word.chars-1) X @ALPHA {
substr(my $tmp = $word,$_[0],1)=$_[1];
take $tmp;
}
}
}
# 'abc' -> 'abbc'
sub insertion($word) {
gather {
for (0..$word.chars) X @ALPHA {
substr(my $tmp = $word,$_[0],0)=$_[1];
take $tmp;
}
}
}
# 'abc' -> 'acb'
sub transposition($w) {
gather for ^$w.chars {
my $tmp=$w;
my $removed =(substr($tmp,$_,1)='');
substr($tmp,$_+1,0)=$removed;
take $tmp;
}
}
sub edits1($w) {
# all these are different, no need to use a set
transposition($w),insertion($w),substitution($w),deletion($w)
}
sub known_edits2($words) {
my @ary = gather {
for edits1($words) -> $e1 {
for edits1($e1) -> $e2 {
take $e2 if %NWORDS{$e2}
}
}
}
any(@ary).values
}
sub known(@words) {
gather for @words {take $_ if %NWORDS{$_}} ;
}
sub correct($w) {
my @values = known([$w]) or known(edits1($w)) or known_edits2($w) or [$w];
# single argument max() doesn't work yet
say @values.perl;
@values.max: {%NWORDS{$^a} <=> %NWORDS{$^b}}
}
|
Sorry, comments are closed for this article.