-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclassify.pl
More file actions
121 lines (107 loc) · 2.84 KB
/
classify.pl
File metadata and controls
121 lines (107 loc) · 2.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/perl -w
# Created: 29 April 2014
# By: Ali Reza Ebadat
# The goal is to classify sentences based on their languages
use strict;
use warnings;
use utf8;
binmode(STDOUT, ":utf8");
binmode(STDIN, ":utf8");
# ngram($word, $n)
#
# Count the character n-grams of a string.
#
# Parameters:
#   $word - the text to scan (a whole sentence is fine; spaces are treated
#           like any other character).
#   $n    - the n-gram length, expected to be a positive integer.
#
# Returns a reference to a hash mapping each distinct substring of exactly
# $n characters to the number of times it occurs in $word. The hash is empty
# when $word is undefined, when $n is undefined or smaller than 1, or when
# $word is shorter than $n.
sub ngram($$) {
    my ($word, $n) = @_;
    my %count_of = ();

    # A size below 1 makes substr() return "" and would record a bogus
    # empty-string "n-gram" once per character, so bail out early.
    return \%count_of if (!defined $word || !defined $n || $n < 1);

    # Last start offset at which a full n-gram still fits; the range below is
    # empty when the word is shorter than $n.
    my $last_start = length($word) - $n;
    for my $start (0 .. $last_start) {
        my $gram = substr($word, $start, $n);
        # The length check also rejects non-integer sizes such as 2.5.
        $count_of{$gram}++ if (length($gram) == $n);
    }
    return \%count_of;
}
###############################
# main
my $testFileName  = "data/corpus-test.txt";
my $trainFileName = "data/corpus-train.txt";
# Each line of both corpora is a sentence followed by its language code as the
# last whitespace-separated token. Blank lines are ignored.

# n-gram size: 2 by default, overridable by the first command-line argument.
my $N = 2;
$N = $ARGV[0] if (defined $ARGV[0]);
# Reject zero, negative and non-numeric sizes up front instead of letting them
# produce warnings and meaningless statistics later on.
die "ngram size must be a positive integer, got '$N'\n"
    unless ($N =~ /\A[1-9][0-9]*\z/);
print "ngram = $N gram\n";

# Training statistics. Only the keys of the per-sentence n-gram hash are used
# below, so every count is "number of sentences containing the n-gram", not
# the number of raw occurrences.
my %lgCodes          = (); # lgCode => number of training sentences
my %lgCodeNgramCount = (); # lgCode => sum over its sentences of distinct n-grams
my %allNgram         = (); # ngram  => number of training sentences containing it
my %langNgram        = (); # lgCode => { ngram => sentences of that language containing it }

# Decode the corpus as UTF-8 so that length()/substr() in ngram() work on
# characters rather than bytes (assumes the corpus files are UTF-8 encoded).
open(my $trainFh, '<:encoding(UTF-8)', $trainFileName)
    or die "Cannot open $trainFileName: $!\n";
while (my $line = <$trainFh>) {
    chomp($line);
    my @parts = split(" ", $line);
    next if (!@parts);                 # blank line: no language code to learn from
    my $lgCode   = pop(@parts);
    my $sentence = join(" ", @parts);
    $lgCodes{$lgCode}++;
    my $ngramHash = ngram($sentence, $N);
    foreach my $ng (keys %$ngramHash) {
        $langNgram{$lgCode}{$ng}++;
        $lgCodeNgramCount{$lgCode}++;
        $allNgram{$ng}++;
    }
}
close($trainFh);
print scalar(keys(%lgCodes))." languages\n";

# Languages seen in only one training sentence are dropped from the candidate
# set. keys() returns a copy of the key list, so deleting while looping is safe.
foreach my $lg (keys(%lgCodes)) {
    delete $lgCodes{$lg} if ($lgCodes{$lg} == 1);
}

# Classify every test sentence and compare the result with its gold label.
my $correct    = 0;
my $totalLines = 0;            # number of non-blank test lines
open(my $testFh, '<:encoding(UTF-8)', $testFileName)
    or die "Cannot open test file:\n $testFileName: $!\n";
while (my $line = <$testFh>) {
    chomp($line);
    my @tokens = split(" ", $line);
    next if (!@tokens);        # blank line: nothing to classify or score
    $totalLines++;
    my $lgCode   = pop(@tokens);
    my $sentence = join(" ", @tokens);
    my $ngramHash = ngram($sentence, $N);

    my $totalNgramCount = 0;
    $totalNgramCount += $_ foreach (values %$ngramHash);

    my $minDistance;
    my $lgRes;
    # Sorted order makes tie-breaking reproducible; Perl's hash order is
    # randomised per process, so unsorted iteration could change the result
    # from one run to the next.
    foreach my $lg (sort keys %lgCodes) {
        # A language whose training sentences were all shorter than $N has no
        # n-gram statistics; skip it rather than divide by zero below.
        my $lgTotal = $lgCodeNgramCount{$lg};
        next if (!$lgTotal);

        # Distance of the sentence from this language: sum of p * log(p / q)
        # over the sentence's n-grams, where p is the n-gram's relative
        # frequency in the sentence and q its relative frequency in the
        # language's training data.
        my $distance = 0;
        foreach my $ng (keys %$ngramHash) {
            next if (!exists $allNgram{$ng});   # ignore n-grams never seen in training
            # An n-gram known from other languages but not this one gets a
            # pseudo-count of 0.5 so that q never becomes zero.
            my $ngCount = 0.5;
            $ngCount = $langNgram{$lg}{$ng} if (exists $langNgram{$lg}{$ng});
            my $qx = $ngCount / $lgTotal;
            my $px = $ngramHash->{$ng} / $totalNgramCount;
            $distance += $px * log($px/$qx);
        }
        if (!defined $minDistance || $distance < $minDistance) {
            $minDistance = $distance;
            $lgRes       = $lg;
        }
    }
    # $lgRes stays undefined when no candidate language is available.
    $correct++ if (defined $lgRes && $lgRes eq $lgCode);
}
close($testFh);

# Percentage of test sentences labelled correctly; 0 when the test file holds
# no sentences (avoids a division by zero).
my $precision = $totalLines ? 100*$correct/$totalLines : 0;
print "Final result: \n";
print " Precision: $precision\n";
print " correct: $correct\n";
print " total: $totalLines\n";