From 559587f42245ca4fef2729fb57e79851a1bfcf25 Mon Sep 17 00:00:00 2001 From: James Taracevicz Date: Fri, 30 Jun 2023 19:41:41 -0700 Subject: [PATCH 1/3] feat: added language_classifier.cc --- binding.gyp | 11 ++++++ src/language_classifier.cc | 70 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 src/language_classifier.cc diff --git a/binding.gyp b/binding.gyp index 9426bcc..737675b 100644 --- a/binding.gyp +++ b/binding.gyp @@ -25,6 +25,17 @@ " +#include +#include + +void LanguageClassifier(const Nan::FunctionCallbackInfo& info) { + v8::Isolate *isolate = info.GetIsolate(); + + if (info.Length() < 1) { + Nan::ThrowTypeError("Usage: language_classifier(text)"); + return; + } + + if (!info[0]->IsString()) { + Nan::ThrowTypeError("First argument must be a string"); + return; + } + + Nan::Utf8String text_utf8(info[0]); + char *text = *text_utf8; + + if (text == NULL) { + Nan::ThrowTypeError("Could not convert first argument to string"); + return; + } + + libpostal_language_classifier_response_t *response = libpostal_classify_language(text); + + if (response != NULL) { + v8::Local lang_array = Nan::New(response->num_languages); + for (size_t i = 0; i < response->num_languages; i++) { + const char* language = response->languages[i]; // Directly access the array + + v8::Local lang_obj = Nan::New(); + Nan::Set(lang_obj, Nan::New("language").ToLocalChecked(), Nan::New(language).ToLocalChecked()); + + Nan::Set(lang_array, i, lang_obj); + } + libpostal_language_classifier_response_destroy(response); + info.GetReturnValue().Set(lang_array); + } +} + +void cleanup(void*) { + libpostal_teardown(); + libpostal_teardown_language_classifier(); +} + +void init(v8::Local exports) { + if (!libpostal_setup() || !libpostal_setup_language_classifier()) { + Nan::ThrowError("Could not load libpostal"); + return; + } + + v8::Local context = exports->CreationContext(); + + exports->Set( + context, + Nan::New("language_classifier").ToLocalChecked(), + Nan::New(LanguageClassifier)->GetFunction(context).ToLocalChecked() + ); + + #if NODE_MAJOR_VERSION >= 12 + node::Environment* env = node::GetCurrentEnvironment(Nan::GetCurrentContext()); + node::AtExit(env, cleanup, NULL); + #else + node::AtExit(cleanup); + #endif +} + +NODE_MODULE(language_classifier, init) From 5ef9546f0afd545cef756d550934fc12f3cb57cb Mon Sep 17 00:00:00 2001 From: James Taracevicz Date: Fri, 30 Jun 2023 19:51:06 -0700 Subject: [PATCH 2/3] feat: included probability in language_classifier --- src/language_classifier.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/language_classifier.cc b/src/language_classifier.cc index 22782e7..45082d1 100644 --- a/src/language_classifier.cc +++ b/src/language_classifier.cc @@ -27,11 +27,14 @@ void LanguageClassifier(const Nan::FunctionCallbackInfo& info) { if (response != NULL) { v8::Local lang_array = Nan::New(response->num_languages); - for (size_t i = 0; i < response->num_languages; i++) { - const char* language = response->languages[i]; // Directly access the array + + for (size_t i = 0; i < response->num_languages; ++i) { + const char *language = response->languages[i]; + const double probability = response->probs[i]; v8::Local lang_obj = Nan::New(); Nan::Set(lang_obj, Nan::New("language").ToLocalChecked(), Nan::New(language).ToLocalChecked()); + Nan::Set(lang_obj, Nan::New("probability").ToLocalChecked(), Nan::New(probability)); Nan::Set(lang_array, i, lang_obj); } From c872942c772808514579bbd372ab83e1c63f94eb Mon Sep 17 00:00:00 2001 From: Peter Johnson Date: Wed, 22 Jan 2025 15:42:11 +0100 Subject: [PATCH 3/3] update language_classifier branch --- binding.gyp | 4 +- index.js | 3 +- src/language_classifier.cc | 7 +++- test/index.test.js | 83 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 94 insertions(+), 3 deletions(-) diff --git a/binding.gyp b/binding.gyp index 737675b..2215ff9 100644 --- a/binding.gyp +++ b/binding.gyp @@ -28,7 +28,9 @@ }, { "target_name": "language_classifier", - "sources": [ "src/language_classifier.cc" ], + "sources": [ + "src/language_classifier.cc" + ], "libraries": [ "-lpostal", "-L/usr/local/lib" ], diff --git a/index.js b/index.js index e524f34..593f8e3 100644 --- a/index.js +++ b/index.js @@ -1,5 +1,6 @@ module.exports = { expand: require('bindings')('expand'), - parser: require('bindings')('parser') + parser: require('bindings')('parser'), + language_classifier: require('bindings')('language_classifier').language_classifier } \ No newline at end of file diff --git a/src/language_classifier.cc b/src/language_classifier.cc index 45082d1..e352b5e 100644 --- a/src/language_classifier.cc +++ b/src/language_classifier.cc @@ -54,7 +54,12 @@ void init(v8::Local exports) { return; } - v8::Local context = exports->CreationContext(); + // Check Node.js version + #if NODE_MAJOR_VERSION >= 16 + v8::Local context = exports->GetCreationContext().ToLocalChecked(); + #else + v8::Local context = exports->CreationContext(); + #endif exports->Set( context, diff --git a/test/index.test.js b/test/index.test.js index 5c0b722..0c2694f 100644 --- a/test/index.test.js +++ b/test/index.test.js @@ -56,3 +56,86 @@ describe('parser', function() { }); }) }) + +describe('language_classifier', function() { + it('should classify phrases', function() { + const cases = [ + { + text: 'street', + expected: [{ language: 'en', probability: 0.9975550392228959 }] + }, + { + text: 'calle', + expected: [{ language: 'es', probability: 0.9948278315613933 }] + }, + { + text: '200 santa monica pier santa monica coahuila 90401 usa', + expected: [{ language: 'es', probability: 0.9889375382113144 }] + }, + { + text: '200 santa monica pier santa monica compania 90401 usa', + expected: [ + { language: 'es', probability: 0.6896583016342134 }, + { language: 'it', probability: 0.12355268595569942 }, + { language: 'en', probability: 0.09448842175870104 }, + { language: 'pt', probability: 0.09152280207660235 } + ] + }, + { + text: '200 santa monica pier santa monica compania anonima 90401 usa', + expected: [ + { language: 'en', probability: 0.39249744852100377 }, + { language: 'pt', probability: 0.27690798537637573 }, + { language: 'es', probability: 0.2631088374468901 }, + { language: 'it', probability: 0.06642801798298495 } + ] + }, + { + text: '200 santa monica pier santa monica calle 90401 usa', + expected: [ + { language: 'es', probability: 0.9332794851572307 }, + { language: 'it', probability: 0.0653229950496308 } + ] + }, + { + text: '200 santa monica pier santa monica ca 90401 usa', + expected: [ + { language: 'es', probability: 0.6860753090491215 }, + { language: 'it', probability: 0.16420520901155986 }, + { language: 'en', probability: 0.08797427320205269 }, + { language: 'pt', probability: 0.061694606284459816 } + ] + }, + { + text: '200 santa monica pier santa monica casa 90401 usa', + expected: [ + { language: 'it', probability: 0.7099331948983176 }, + { language: 'pt', probability: 0.1484441794525817 }, + { language: 'en', probability: 0.09171194632862785 } + ] + }, + { + text: '200 santa monica pier santa monica cagliari 90401 usa', + expected: [{ language: 'it', probability: 0.9409304710772296 }] + }, + { + text: '200 santa monica pier santa monica california 90401 usa', + expected: [ + { language: 'en', probability: 0.529208078531959 }, + { language: 'pt', probability: 0.26387306487348855 }, + { language: 'es', probability: 0.20130465470054235 } + ] + }, + { + text: '200 santa monica pier santa monica companhia anonima 90401 usa', + expected: [ + { language: 'pt', probability: 0.8631314862441015 }, + { language: 'en', probability: 0.1090210811124842 } + ] + } + ]; + for (const c of cases) { + assert.deepEqual(postal.language_classifier(c.text), c.expected, c.text); + } + }); +}) \ No newline at end of file