language-classifier/classify.pl at master · arezae/language-classifier · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/perl -w

# Created: 29 April 2014
# By: Ali Reza Ebadat

# Goal: identify the language of each sentence by comparing its character
# n-grams with per-language n-gram statistics learned from a training corpus.


use strict;
use warnings;

use utf8;
binmode(STDOUT, ":utf8");
binmode(STDIN, ":utf8");


# Count the overlapping character n-grams of a string.
#
# Parameters:
#   $word - the text to slice (callers in this file pass a whole sentence)
#   $n    - n-gram length in characters; must be a positive integer
#
# Returns a reference to a hash that maps every distinct substring of exactly
# $n characters to the number of times it occurs in $word. The hash is empty
# when $word is undefined or shorter than $n, and when $n is undefined, less
# than 1, or not a whole number.
#
# The ($$) prototype is kept so that existing call sites parse as before.
sub ngram($$){
	my ($word, $n) = @_;
	my %ngram = ();

	# Reject lengths that cannot produce a meaningful n-gram; without this
	# guard $n == 0 would count the empty string once per character.
	return \%ngram if (!defined $word || !defined $n || $n < 1 || $n != int($n));

	# Only start positions that leave room for a full n-gram are visited, so
	# every substr() below is exactly $n characters long. The range is empty
	# when $word is shorter than $n.
	my $lastStart = length($word) - $n;
	for my $i (0 .. $lastStart){
		$ngram{ substr($word, $i, $n) }++;
	}
	return \%ngram;
}
###############################
# main
my $testFileName = "data/corpus-test.txt";
my $trainFileName = "data/corpus-train.txt";

# Corpus format (both files): one sentence per line, tokens separated by
# whitespace, with the language code as the last token of the line.

# n-gram length; the first command-line argument overrides the default.
my $N = 2;
if (defined $ARGV[0]){
	die "Usage: $0 [N]  (N must be a positive integer, got '$ARGV[0]')\n"
		unless ($ARGV[0] =~ /\A[0-9]+\z/ && $ARGV[0] > 0);
	$N = $ARGV[0];
}

print "ngram = $N gram\n";

my %lgCodes = ();          # lgCode -> number of training sentences
my %lgCodeNgramCount = (); # lgCode -> sum of all counts stored in $langNgram{lgCode}
my %allNgram = ();         # ngram -> number of training sentences containing it
my %langNgram = ();        # lgCode -> ngram -> number of that language's sentences containing it

# ---- training ----
# The file is decoded as UTF-8 so that n-grams are built from characters
# rather than from the individual bytes of multi-byte characters.
open(my $trainFh, "<:encoding(UTF-8)", $trainFileName)
	or die "Cannot open $trainFileName: $!\n";
while(my $line = <$trainFh>){
	chomp($line);
	my @parts = split(" ", $line);
	next if (!@parts); # blank line: there is no language code to learn from
	my $lgCode = pop(@parts);
	my $sentence = join(" ", @parts);
	$lgCodes{$lgCode}++;
	# Each distinct n-gram is counted once per sentence, whatever its
	# frequency inside that sentence.
	foreach my $ng (keys %{ ngram($sentence, $N) }){
		$langNgram{$lgCode}{$ng}++;
		$lgCodeNgramCount{$lgCode}++;
		$allNgram{$ng}++;
	}
}
close($trainFh);
print scalar(keys %lgCodes) . " languages\n";

# Languages seen in exactly one training sentence are not used as candidates.
# Languages that contributed no n-gram at all (every sentence shorter than $N)
# are dropped as well, because their total would be a zero divisor below.
foreach my $lg (keys %lgCodes){
	delete $lgCodes{$lg} if ($lgCodes{$lg} == 1 || !$lgCodeNgramCount{$lg});
}

# Candidates are visited in sorted order so that exact ties (for example a
# sentence without any known n-gram) resolve the same way on every run.
my @candidates = sort keys %lgCodes;

# ---- evaluation ----
my $correct = 0;
my $totalLines = 0; # number of non-blank test sentences
open(my $testFh, "<:encoding(UTF-8)", $testFileName)
	or die "Cannot open test file:\n $testFileName: $!\n";
while(my $line = <$testFh>){
	chomp($line);
	my @tokens = split(" ", $line);
	next if (!@tokens); # blank lines are not test sentences
	$totalLines++;
	my $lgCode = pop(@tokens);
	my $sentence = join(" ", @tokens);
	my $ngramHash = ngram($sentence, $N);

	# Total number of n-gram occurrences in the sentence (known or not)
	my $totalNgramCount = 0;
	$totalNgramCount += $_ for (values %$ngramHash);

	my $minDistance;
	my $lgRes;
	foreach my $lg (@candidates){
		# Distance of the sentence from this language: a Kullback-Leibler
		# style sum of px * log(px / qx) over the sentence's n-grams.
		my $lgNgrams = $langNgram{$lg};
		my $lgTotal = $lgCodeNgramCount{$lg};
		my $distance = 0;
		foreach my $ng (keys %$ngramHash){
			# n-grams never seen in any training sentence are ignored
			next if (!defined $allNgram{$ng});
			# 0.5 stands in for n-grams known from other languages only
			my $ngCount = defined $lgNgrams->{$ng} ? $lgNgrams->{$ng} : 0.5;
			my $qx = $ngCount / $lgTotal;
			my $px = $ngramHash->{$ng} / $totalNgramCount;
			$distance += $px * log($px/$qx);
		}
		if (!defined $minDistance || $distance < $minDistance){
			$minDistance = $distance;
			$lgRes = $lg;
		}
	}
	# $lgRes stays undefined when there is no candidate language at all
	$correct++ if (defined $lgRes && $lgRes eq $lgCode);
}
close($testFh);

# Percentage of test sentences whose language was guessed correctly;
# reported as 0 when the test file holds no sentence.
my $precision = $totalLines ? 100*$correct/$totalLines : 0;
print "Final result: \n";
print "     Precision: $precision\n";
print "     correct: $correct\n";
print "     total: $totalLines\n";