From 8e1e7a3aa4efc9f36d688ef0f962fa9336c07bea Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Thu, 11 Aug 2016 08:04:55 +0700 Subject: [PATCH 001/109] Fix API documentation --- README.md | 40 +++++++++++++++++++--------------------- fasttext/fasttext.pyx | 4 ++-- 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index ba17bcd..2a76379 100644 --- a/README.md +++ b/README.md @@ -26,10 +26,8 @@ and [2](#bag-of-tricks-for-efficient-text-classification). ### Word representation learning In order to learn word vectors, as described in -[1](#enriching-word-vectors-with-subword-information), - -We can use `fasttext.skipgram` and `fasttext.cbow` function -like the following: +[1](#enriching-word-vectors-with-subword-information), we can use +`fasttext.skipgram` and `fasttext.cbow` function like the following: ```python import fasttext @@ -122,23 +120,23 @@ model.get_vector(word) List of params and their default value: ``` -input training file path -output output file path -lr learning rate [0.05] -dim size of word vectors [100] -ws size of the context window [5] -epoch number of epochs [5] -min_count minimal number of word occurences [1] -neg number of negatives sampled [5] -word_ngrams max length of word ngram [1] -loss loss function {ns, hs, softmax} [ns] -bucket number of buckets [2000000] -minn min length of char ngram [3] -maxn max length of char ngram [6] -thread number of threads [12] -verbose how often to print to stdout [10000] -t sampling threshold [0.0001] -silent suspress the log from the C++ extension [1] +input training file path +output output file path +lr learning rate [0.05] +lr_update_rate change the rate of updates for the learning rate [100] +dim size of word vectors [100] +ws size of the context window [5] +epoch number of epochs [5] +min_count minimal number of word occurences [1] +neg number of negatives sampled [5] +word_ngrams max length of word ngram [1] +loss loss function {ns, hs, softmax} [ns] +bucket 
number of buckets [2000000] +minn min length of char ngram [3] +maxn max length of char ngram [6] +thread number of threads [12] +t sampling threshold [0.0001] +silent suspress the log from the C++ extension [1] ``` ## References diff --git a/fasttext/fasttext.pyx b/fasttext/fasttext.pyx index 7873637..90fd7c9 100644 --- a/fasttext/fasttext.pyx +++ b/fasttext/fasttext.pyx @@ -158,7 +158,7 @@ def _wordvector_model(model_name, input_file, output, lr, dim, ws, epoch, # Learn word representation using skipgram model def skipgram(input_file, output, lr=0.05, dim=100, ws=5, epoch=5, min_count=5, neg=5, word_ngrams=1, loss='ns', bucket=2000000, minn=3, maxn=6, - thread=12, lr_update_rate=10000, t=1e-4, silent=1): + thread=12, lr_update_rate=100, t=1e-4, silent=1): return _wordvector_model('skipgram', input_file, output, lr, dim, ws, epoch, min_count, neg, word_ngrams, loss, bucket, minn, maxn, thread, lr_update_rate, t, silent) @@ -166,7 +166,7 @@ def skipgram(input_file, output, lr=0.05, dim=100, ws=5, epoch=5, min_count=5, # Learn word representation using cbow model def cbow(input_file, output, lr=0.05, dim=100, ws=5, epoch=5, min_count=5, neg=5, word_ngrams=1, loss='ns', bucket=2000000, minn=3, maxn=6, - thread=12, lr_update_rate=10000, t=1e-4, silent=1): + thread=12, lr_update_rate=100, t=1e-4, silent=1): return _wordvector_model('cbow', input_file, output, lr, dim, ws, epoch, min_count, neg, word_ngrams, loss, bucket, minn, maxn, thread, lr_update_rate, t, silent) From ddb97a0eed4235f5e03013565d0d599d1ba7b193 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Thu, 11 Aug 2016 08:17:18 +0700 Subject: [PATCH 002/109] Fix spelling on README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2a76379..85f1327 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ fasttext is a Python interface for ## Requirements fasttext support Python 2.6 or newer. 
It requires -[Cython](https://pypi.python.org/pypi/Cython/) in order to compile the C++ extension. +[Cython](https://pypi.python.org/pypi/Cython/) in order to build the C++ extension. ## Installation @@ -136,7 +136,7 @@ minn min length of char ngram [3] maxn max length of char ngram [6] thread number of threads [12] t sampling threshold [0.0001] -silent suspress the log from the C++ extension [1] +silent disable the log output from the C++ extension [1] ``` ## References From 8c77787b64906fde70a03a428e107d6a8588a165 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Fri, 12 Aug 2016 07:24:47 +0700 Subject: [PATCH 003/109] Fix #10: Support unicode instead of plain bytes --- Makefile | 2 +- fasttext/fasttext.pyx | 47 ++++++++++++++++++++++++------------------ fasttext/interface.cc | 10 ++++++++- fasttext/interface.h | 3 ++- fasttext/interface.pxd | 36 ++++++++++++++++++++------------ fasttext/model.py | 8 +++---- test/params_test.txt | 2 +- test/skipgram_test.py | 17 +++++++++++++-- 8 files changed, 82 insertions(+), 43 deletions(-) diff --git a/Makefile b/Makefile index ee59bf0..d25ed7e 100644 --- a/Makefile +++ b/Makefile @@ -23,7 +23,7 @@ fasttext/cpp/fasttext: test/skipgram_params_test.bin: ./fasttext/cpp/fasttext skipgram -input test/params_test.txt -output \ test/skipgram_params_test -lr 0.025 -dim 100 -ws 5 -epoch 1 \ - -minCount 5 -neg 5 -loss ns -bucket 2000000 -minn 3 -maxn 6 \ + -minCount 1 -neg 5 -loss ns -bucket 2000000 -minn 3 -maxn 6 \ -thread 4 -lrUpdateRate 100 -t 1e-4 test-skipgram: fasttext/cpp/fasttext test/skipgram_params_test.bin diff --git a/fasttext/fasttext.pyx b/fasttext/fasttext.pyx index 90fd7c9..d40f3a6 100644 --- a/fasttext/fasttext.pyx +++ b/fasttext/fasttext.pyx @@ -3,6 +3,7 @@ cimport utils from interface cimport trainWrapper from interface cimport loadModelWrapper from interface cimport FastTextModel +from interface cimport Dictionary # Python/C++ standart libraries from libc.stdlib cimport malloc, free @@ -21,67 +22,67 @@ cdef 
class FastTextModelWrapper: self.fm = FastTextModel() def get_words(self): - return self.fm.getWords() + return self.words def get_vector(self, word): - word_bytes = bytes(word, 'ascii') + word_bytes = bytes(word, 'utf-8') return self.fm.getVectorWrapper(word_bytes) @property def dim(self): - return self.fm.dim; + return self.fm.dim @property def ws(self): - return self.fm.ws; + return self.fm.ws @property def epoch(self): - return self.fm.epoch; + return self.fm.epoch @property def minCount(self): - return self.fm.minCount; + return self.fm.minCount @property def neg(self): - return self.fm.neg; + return self.fm.neg @property def wordNgrams(self): - return self.fm.wordNgrams; + return self.fm.wordNgrams @property def lossName(self): - return self.fm.lossName; + return self.fm.lossName @property def modelName(self): - return self.fm.modelName; + return self.fm.modelName @property def bucket(self): - return self.fm.bucket; + return self.fm.bucket @property def minn(self): - return self.fm.minn; + return self.fm.minn @property def maxn(self): - return self.fm.maxn; + return self.fm.maxn @property def lrUpdateRate(self): - return self.fm.lrUpdateRate; + return self.fm.lrUpdateRate @property def neg(self): - return self.fm.neg; + return self.fm.neg @property def t(self): - return self.fm.t; + return self.fm.t # load_model: load a word vector model def load_model(filename): @@ -90,13 +91,19 @@ def load_model(filename): raise ValueError('fastText: trained model cannot be opened!') model = FastTextModelWrapper() - filename_bytes = bytes(filename, 'ascii') + filename_bytes = bytes(filename, 'utf-8') loadModelWrapper(filename_bytes, model.fm) + dictionary = model.fm.getDictionary() + cdef string word + words = [] + for i in xrange(dictionary.nwords()): + word = dictionary.getWord(i) + words.append(word.decode('utf-8')) # TODO: handle supervised here model_name = model.fm.modelName if model_name == 'skipgram' or model_name == 'cbow': - return WordVectorModel(model) + return 
WordVectorModel(model, words) else: raise ValueError('fastText: model name not exists!') @@ -121,7 +128,7 @@ def _wordvector_model(model_name, input_file, output, lr, dim, ws, epoch, utils.initTables() # Setup argv, arguments and their values - py_argv = [b'fasttext', bytes(model_name, 'ascii')] + py_argv = [b'fasttext', bytes(model_name, 'utf-8')] py_args = [b'-input', b'-output', b'-lr', b'-dim', b'-ws', b'-epoch', b'-minCount', b'-neg', b'-wordNgrams', b'-loss', b'-bucket', b'-minn', b'-maxn', b'-thread', b'-lrUpdateRate', b'-t'] @@ -130,7 +137,7 @@ def _wordvector_model(model_name, input_file, output, lr, dim, ws, epoch, for arg, value in zip(py_args, values): py_argv.append(arg) - py_argv.append(bytes(str(value), 'ascii')) + py_argv.append(bytes(str(value), 'utf-8')) argc = len(py_argv) # Converting Python object to C++ diff --git a/fasttext/interface.cc b/fasttext/interface.cc index 8ab6828..80cdffc 100644 --- a/fasttext/interface.cc +++ b/fasttext/interface.cc @@ -19,6 +19,11 @@ std::vector FastTextModel::getWords() return _words; } +Dictionary FastTextModel::getDictionary() +{ + return _dict; +} + void FastTextModel::addWord(std::string word) { _words.push_back(word); @@ -106,9 +111,12 @@ void loadModelWrapper(std::string filename, FastTextModel& model) model.setDict(dict); model.setMatrix(input); - Vector vec(args.dim); + /* Do the indexing on Cython instead to support unicode + * instead of plain bytes */ + /* for(int32_t i = 0; i < dict.nwords(); i++) { std::string word = dict.getWord(i); model.addWord(word); } + */ } diff --git a/fasttext/interface.h b/fasttext/interface.h index 54afb3b..05cdf48 100644 --- a/fasttext/interface.h +++ b/fasttext/interface.h @@ -12,7 +12,6 @@ class FastTextModel { private: std::vector _words; - Dictionary _dict; Matrix _matrix; @@ -39,6 +38,8 @@ class FastTextModel { void setDict(Dictionary dict); void setMatrix(Matrix matrix); void setArg(Args arg); + + Dictionary getDictionary(); }; void trainWrapper(int argc, char 
**argv, int silent); diff --git a/fasttext/interface.pxd b/fasttext/interface.pxd index 3bb4fc7..b2fa57c 100644 --- a/fasttext/interface.pxd +++ b/fasttext/interface.pxd @@ -1,30 +1,40 @@ # fastText C++ interface from libcpp.string cimport string from libcpp.vector cimport vector +from libc.stdint cimport int32_t cdef extern from "cpp/src/real.h": ctypedef float real +cdef extern from "cpp/src/dictionary.h": + cdef cppclass Dictionary: + Dictionary() + int32_t nwords() + string getWord(int32_t) + cdef extern from "interface.h": cdef cppclass FastTextModel: FastTextModel() - int dim; - int ws; - int epoch; - int minCount; - int neg; - int wordNgrams; - string lossName; - string modelName; - int bucket; - int minn; - int maxn; - int lrUpdateRate; - double t; + int dim + int ws + int epoch + int minCount + int neg + int wordNgrams + string lossName + string modelName + int bucket + int minn + int maxn + int lrUpdateRate + double t vector[string] getWords() vector[real] getVectorWrapper(string word) + Dictionary getDictionary() + void trainWrapper(int argc, char **argvm, int silent) void loadModelWrapper(string filename, FastTextModel& model) + diff --git a/fasttext/model.py b/fasttext/model.py index 0b8fa4f..4c07f74 100644 --- a/fasttext/model.py +++ b/fasttext/model.py @@ -3,17 +3,17 @@ from numpy.linalg import norm class WordVectorModel(object): - def __init__(self, model): + def __init__(self, model, words): self._model = model - self.words = model.get_words() + self.words = words self.dim = model.dim; self.ws = model.ws; self.epoch = model.epoch; self.min_count = model.minCount; self.neg = model.neg; self.word_ngrams = model.wordNgrams; - self.loss_name = model.lossName.decode('ascii'); - self.model_name = model.modelName.decode('ascii'); + self.loss_name = model.lossName.decode('utf-8'); + self.model_name = model.modelName.decode('utf-8'); self.bucket = model.bucket; self.minn = model.minn; self.maxn = model.maxn; diff --git a/test/params_test.txt 
b/test/params_test.txt index 1c357be..81e5e5e 100644 --- a/test/params_test.txt +++ b/test/params_test.txt @@ -1 +1 @@ - anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans culottes of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philosophy is the belief that rulers are unnecessary and should be abolished although there are differing interpretations of what this means anarchism also refers to related social movements that advocate the elimination of authoritarian institutions particularly the state the word anarchy as most anarchists use it does not imply chaos nihilism or anomie but rather a harmonious anti authoritarian society in place of what are regarded as authoritarian political structures and coercive economic institutions anarchists advocate social relations based upon voluntary association of autonomous individuals mutual aid and self governance while anarchism is most easily defined by what it is against anarchists also offer positive visions of what they believe to be a truly free society however ideas about how an anarchist society might work vary considerably especially with respect to economics there is also disagreement about how a free society might be brought about origins and predecessors kropotkin and others argue that before recorded history human society was organized on anarchist principles most anthropologists follow kropotkin and engels in believing that hunter gatherer bands were egalitarian and lacked division of labour accumulated wealth or decreed law and had equal access to resources william godwin anarchists including the the anarchy organisation and rothbard 
find anarchist attitudes in taoism from ancient china kropotkin found similar ideas in stoic zeno of citium according to kropotkin zeno repudiated the omnipotence of the state its intervention and regimentation and proclaimed the sovereignty of the moral law of the individual the anabaptists of one six th century europe are sometimes considered to be religious forerunners of modern anarchism bertrand russell in his history of western philosophy writes that the anabaptists repudiated all law since they held that the good man will be guided at every moment by the holy spirit from this premise they arrive at communism the diggers or true levellers were an early communistic movement during the time of the english civil war and are considered by some as forerunners of modern anarchism in the modern era the first to use the term to mean something other than chaos was louis armand baron de lahontan in his nouveaux voyages dans l am rique septentrionale one seven zero three where he described the indigenous american society which had no state laws prisons priests or private property as being in anarchy russell means a libertarian and leader in the american indian movement has repeatedly stated that he is an anarchist and so are all his ancestors in one seven nine three in the thick of the french revolution william godwin published an enquiry concerning political justice although godwin did not use the word anarchism many later anarchists have regarded this book as the first major anarchist text and godwin as the founder of philosophical anarchism but at this point no anarchist movement yet existed and the term anarchiste was known mainly as an insult hurled by the bourgeois girondins at more radical elements in the french revolution the first self labelled anarchist pierre joseph proudhon it is commonly held that it wasn t until pierre joseph proudhon published what is property in one eight four zero that the term anarchist was adopted as a self description it is for this 
reason that some claim proudhon as the founder of modern anarchist theory in what is property proudhon answers with the famous accusation property is theft in this work he opposed the institution of decreed property propri t where owners have complete rights to use and abuse their property as they wish such as exploiting workers for profit in its place proudhon supported what he called possession individuals can have limited rights to use resources capital and goods in accordance with principles of equality and justice proudhon s vision of anarchy which he called mutualism mutuellisme involved an exchange economy where individuals and groups could trade the products of their labor using labor notes which represented the amount of working time involved in production this would ensure that no one would profit from the labor of others workers could freely join together in co operative workshops an interest free bank would be set up to provide everyone with access to the means of production proudhon s ideas were influential within french working class movements and his followers were active in the revolution of one eight four eight in france proudhon s philosophy of property is complex it was developed in a number of works over his lifetime and there are differing interpretations of some of his ideas for more detailed discussion see here max stirner s egoism in his the ego and its own stirner argued that most commonly accepted social institutions including the notion of state property as a right natural rights in general and the very notion of society were mere illusions or ghosts in the mind saying of society that the individuals are its reality he advocated egoism and a form of amoralism in which individuals would unite in associations of egoists only when it was in their self interest to do so for him property simply comes about through might whoever knows how to take to defend the thing to him belongs property and what i have in my power that is my own so long as i 
assert myself as holder i am the proprietor of the thing stirner never called himself an anarchist he accepted only the label egoist nevertheless his ideas were influential on many individualistically inclined anarchists although interpretations of his thought are diverse american individualist anarchism benjamin tucker in one eight two five josiah warren had participated in a communitarian experiment headed by robert owen called new harmony which failed in a few years amidst much internal conflict warren blamed the community s failure on a lack of individual sovereignty and a lack of private property warren proceeded to organise experimenal anarchist communities which respected what he called the sovereignty of the individual at utopia and modern times in one eight three three warren wrote and published the peaceful revolutionist which some have noted to be the first anarchist periodical ever published benjamin tucker says that warren was the first man to expound and formulate the doctrine now known as anarchism liberty xiv december one nine zero zero one benjamin tucker became interested in anarchism through meeting josiah warren and william b greene he edited and published liberty from august one eight eight one to april one nine zero eight it is widely considered to be the finest individualist anarchist periodical ever issued in the english language tucker s conception of individualist anarchism incorporated the ideas of a variety of theorists greene s ideas on mutual banking warren s ideas on cost as the limit of price a heterodox variety of labour theory of value proudhon s market anarchism max stirner s egoism and herbert spencer s law of equal freedom tucker strongly supported the individual s right to own the product of his or her labour as private property and believed in a market economy for trading this property he argued that in a truly free market system without the state the abundance of competition would eliminate profits and ensure that all workers 
received the full value of their labor other one nine th century individualists included lysander spooner stephen pearl andrews and victor yarros the first international mikhail bakunin one eight one four one eight seven six in europe harsh reaction followed the revolutions of one eight four eight twenty years later in one eight six four the international workingmen s association sometimes called the first international united some diverse european revolutionary currents including anarchism due to its genuine links to active workers movements the international became signficiant from the start karl marx was a leading figure in the international he was elected to every succeeding general council of the association the first objections to marx came from the mutualists who opposed communism and statism shortly after mikhail bakunin and his followers joined in one eight six eight the first international became polarised into two camps with marx and bakunin as their respective figureheads the clearest difference between the camps was over strategy the anarchists around bakunin favoured in kropotkin s words direct economical struggle against capitalism without interfering in the political parliamentary agitation at that time marx and his followers focused on parliamentary activity bakunin characterised marx s ideas as authoritarian and predicted that if a marxist party gained to power its leaders would end up as bad as the ruling class they had fought against in one eight seven two the conflict climaxed with a final split between the two groups at the hague congress this is often cited as the origin of the conflict between anarchists and marxists from this moment the social democratic and libertarian currents of socialism had distinct organisations including rival internationals anarchist communism peter kropotkin proudhon and bakunin both opposed communism associating it with statism however in the one eight seven zero s many anarchists moved away from bakunin s 
economic thinking called collectivism and embraced communist concepts communists believed the means of production should be owned collectively and that goods be distributed by need not labor an early anarchist communist was joseph d jacque the first person to describe himself as libertarian unlike proudhon he argued that it is not the product of his or her labor that the worker has a right to but to the satisfaction of his or her needs whatever may be their nature he announced his ideas in his us published journal le libertaire one eight five eight one eight six one peter kropotkin often seen as the most important theorist outlined his economic ideas in the conquest of bread and fields factories and workshops he felt co operation is more beneficial than competition illustrated in nature in mutual aid a factor of evolution one eight nine seven subsequent anarchist communists include emma goldman and alexander berkman many in the anarcho syndicalist movements see below saw anarchist communism as their objective isaac puente s one nine three two comunismo libertario was adopted by the spanish cnt as its manifesto for a post revolutionary society some anarchists disliked merging communism with anarchism several individualist anarchists maintained that abolition of private property was not consistent with liberty for example benjamin tucker whilst professing respect for kropotkin and publishing his work described communist anarchism as pseudo anarchism propaganda of the deed johann most was an outspoken advocate of violence anarchists have often been portrayed as dangerous and violent due mainly to a number of high profile violent acts including riots assassinations insurrections and terrorism by some anarchists some revolutionaries of the late one nine th century encouraged acts of political violence such as bombings and the assassinations of heads of state to further anarchism such actions have sometimes been called propaganda by the deed one of the more outspoken 
advocates of this strategy was johann most who said the existing system will be quickest and most radically overthrown by the annihilation of its exponents therefore massacres of the enemies of the people must be set in motion most s preferred method of terrorism dynamite earned him the moniker dynamost however there is no consensus on the legitimacy or utility of violence in general mikhail bakunin and errico malatesta for example wrote of violence as a necessary and sometimes desirable force in revolutionary settings but at the same time they denounced acts of individual terrorism malatesta in on violence and bakunin when he refuted nechaev other anarchists sometimes identified as pacifist anarchists advocated complete nonviolence leo tolstoy whose philosophy is often viewed as a form of christian anarchism see below was a notable exponent of nonviolent resistance anarchism in the labour movement the red and black flag coming from the experience of anarchists in the labour movement is particularly associated with anarcho syndicalism anarcho syndicalism was an early two zero th century working class movement seeking to overthrow capitalism and the state to institute a worker controlled society the movement pursued industrial actions such as general strike as a primary strategy many anarcho syndicalists believed in anarchist communism though not all communists believed in syndicalism after the one eight seven one repression french anarchism reemerged influencing the bourses de travails of autonomous workers groups and trade unions from this movement the conf d ration g n rale du travail general confederation of work cgt was formed in one eight nine five as the first major anarcho syndicalist movement emile pataud and emile pouget s writing for the cgt saw libertarian communism developing from a general strike after one nine one four the cgt moved away from anarcho syndicalism due to the appeal of bolshevism french style syndicalism was a significant movement in 
europe prior to one nine two one and remained a significant movement in spain until the mid one nine four zero s the industrial workers of the world iww founded in one nine zero five in the us espoused unionism and sought a general strike to usher in a stateless society in one nine two three one zero zero zero zero zero members existed with the support of up to three zero zero zero zero zero though not explicitly anarchist they organized by rank and file democracy embodying a spirit of resistance that has inspired many anglophone syndicalists cnt propaganda from april two zero zero four reads don t let the politicians rule our lives you vote and they decide don t allow it unity action self management spanish anarchist trade union federations were formed in the one eight seven zero s one nine zero zero and one nine one zero the most successful was the confederaci n nacional del trabajo national confederation of labour cnt founded in one nine one zero prior to the one nine four zero s the cnt was the major force in spanish working class politics with a membership of one five eight million in one nine three four the cnt played a major role in the spanish civil war see also anarchism in spain syndicalists like ricardo flores mag n were key figures in the mexican revolution latin american anarchism was strongly influenced extending to the zapatista rebellion and the factory occupation movements in argentina in berlin in one nine two two the cnt was joined with the international workers association an anarcho syndicalist successor to the first international contemporary anarcho syndicalism continues as a minor force in many socities much smaller than in the one nine one zero s two zero s and three zero s the largest organised anarchist movement today is in spain in the form of the confederaci n general del trabajo and the cnt the cgt claims a paid up membership of six zero zero zero zero and received over a million votes in spanish syndical elections other active 
syndicalist movements include the us workers solidarity alliance and the uk solidarity federation the revolutionary industrial unionist industrial workers of the world also exists claiming two zero zero zero paid members contemporary critics of anarcho syndicalism and revolutionary industrial unionism claim that they are workerist and fail to deal with economic life outside work post leftist critics such as bob black claim anarcho syndicalism advocates oppressive social structures such as work and the workplace anarcho syndicalists in general uphold principles of workers solidarity direct action and self management the russian revolution the russian revolution of one nine one seven was a seismic event in the development of anarchism as a movement and as a philosophy anarchists participated alongside the bolsheviks in both february and october revolutions many anarchists initially supporting the bolshevik coup however the bolsheviks soon turned against the anarchists and other left wing opposition a conflict which culminated in the one nine one eight kronstadt rebellion anarchists in central russia were imprisoned or driven underground or joined the victorious bolsheviks in ukraine anarchists fought in the civil war against both whites and bolsheviks within the makhnovshchina peasant army led by nestor makhno expelled american anarchists emma goldman and alexander berkman before leaving russia were amongst those agitating in response to bolshevik policy and the suppression of the kronstadt uprising both wrote classic accounts of their experiences in russia aiming to expose the reality of bolshevik control for them bakunin s predictions about the consequences of marxist rule had proved all too true the victory of the bolsheviks in the october revolution and the resulting russian civil war did serious damage to anarchist movements internationally many workers and activists saw bolshevik success as setting an example communist parties grew at the expense of anarchism 
and other socialist movements in france and the us for example the major syndicalist movements of the cgt and iww began to realign themselves away from anarchism and towards the communist international in paris the dielo truda group of russian anarchist exiles which included nestor makhno concluded that anarchists needed to develop new forms of organisation in response to the structures of bolshevism their one nine two six manifesto known as the organisational platform of the libertarian communists was supported by some communist anarchists though opposed by many others the platform continues to inspire some contemporary anarchist groups who believe in an anarchist movement organised around its principles of theoretical unity tactical unity collective responsibility and federalism platformist groups today include the workers solidarity movement in ireland the uk s anarchist federation and the late north eastern federation of anarchist communists in the northeastern united states and bordering canada the fight against fascism spain one nine three six members of the cnt construct armoured cars to fight against the fascists in one of the collectivised factories in the one nine two zero s and one nine three zero s the familiar dynamics of anarchism s conflict with the state were transformed by the rise of fascism in europe in many cases european anarchists faced difficult choices should they join in popular fronts with reformist democrats and soviet led communists against a common fascist enemy luigi fabbri an exile from italian fascism was amongst those arguing that fascism was something different fascism is not just another form of government which like all others uses violence it is the most authoritarian and the most violent form of government imaginable it represents the utmost glorification of the theory and practice of the principle of authority in france where the fascists came close to insurrection in the february one nine three four riots anarchists divided 
over a united front policy in spain the cnt initially refused to join a popular front electoral alliance and abstention by cnt supporters led to a right wing election victory but in one nine three six the cnt changed its policy and anarchist votes helped bring the popular front back to power months later the ruling class responded with an attempted coup and the spanish civil war one nine three six three nine was underway in reponse to the army rebellion an anarchist inspired movement of peasants and workers supported by armed militias took control of the major city of barcelona and of large areas of rural spain where they collectivized the land but even before the eventual fascist victory in one nine three nine the anarchists were losing ground in a bitter struggle with the stalinists the cnt leadership often appeared confused and divided with some members controversially entering the government stalinist led troops suppressed the collectives and persecuted both dissident marxists and anarchists since the late one nine seven zero s anarchists have been involved in fighting the rise of neo fascist groups in germany and the united kingdom some anarchists worked within militant anti fascist groups alongside members of the marxist left they advocated directly combating fascists with physical force rather than relying on the state since the late one nine nine zero s a similar tendency has developed within us anarchism see also anti racist action us anti fascist action uk antifa religious anarchism leo tolstoy one eight two eight one nine one zero most anarchist culture tends to be secular if not outright anti religious however the combination of religious social conscience historical religiousity amongst oppressed social classes and the compatibility of some interpretations of religious traditions with anarchism has resulted in religious anarchism christian anarchists believe that there is no higher authority than god and oppose earthly authority such as government and 
established churches they believe that jesus teachings were clearly anarchistic but were corrupted when christianity was declared the official religion of rome christian anarchists who follow jesus directive to turn the other cheek are strict pacifists the most famous advocate of christian anarchism was leo tolstoy author of the kingdom of god is within you who called for a society based on compassion nonviolent principles and freedom christian anarchists tend to form experimental communities they also occasionally resist taxation many christian anarchists are vegetarian or vegan christian anarchy can be said to have roots as old as the religion s birth as the early church exhibits many anarchistic tendencies such as communal goods and wealth by aiming to obey utterly certain of the bible s teachings certain anabaptist groups of sixteenth century europe attempted to emulate the early church s social economic organisation and philosophy by regarding it as the only social structure capable of true obedience to jesus teachings and utterly rejected in theory all earthly hierarchies and authority and indeed non anabaptists in general and violence as ungodly such groups for example the hutterites typically went from initially anarchistic beginnings to as their movements stabilised more authoritarian social models chinese anarchism was most influential in the one nine two zero s strands of chinese anarchism included tai xu s buddhist anarchism which was influenced by tolstoy and the well field system neopaganism with its focus on the environment and equality along with its often decentralized nature has led to a number of neopagan anarchists one of the most prominent is starhawk who writes extensively about both spirituality and activism anarchism and feminism emma goldman early french feminists such as jenny d h ricourt and juliette adam criticised the misogyny in the anarchism of proudhon during the one eight five zero s anarcha feminism is a kind of radical feminism
that espouses the belief that patriarchy is a fundamental problem in society while anarchist feminism has existed for more than a hundred years its explicit formulation as anarcha feminism dates back to the early seven zero s during the second wave feminist movement anarcha feminism views patriarchy as the first manifestation of hierarchy in human history thus the first form of oppression occurred in the dominance of male over female anarcha feminists then conclude that if feminists are against patriarchy they must also be against all forms of hierarchy and therefore must reject the authoritarian nature of the state and capitalism anarcho primitivists see the creation of gender roles and patriarchy as a creation of the start of civilization and therefore consider primitivism to also be an anarchist school of thought that addresses feminist concerns eco feminism is often considered a feminist variant of green anarchist feminist thought anarcha feminism is most often associated with early two zero th century authors and theorists such as emma goldman and voltairine de cleyre although even early first wave feminist mary wollstonecraft held proto anarchist views and william godwin is often considered a feminist anarchist precursor it should be noted that goldman and de cleyre though they both opposed the state had opposing philosophies as de cleyre explains miss goldman is a communist i am an individualist she wishes to destroy the right of property i wish to assert it i make my war upon privilege and authority whereby the right of property the true right in that which is proper to the individual is annihilated she believes that co operation would entirely supplant competition i hold that competition in one form or another will always exist and that it is highly desirable it should in the spanish civil war an anarcha feminist group free women organized to defend both anarchist and feminist ideas in the modern day anarchist movement many anarchists male or female consider
themselves feminists and anarcha feminist ideas are growing the publishing of quiet rumors an anarcha feminist reader has helped to spread various kinds of anti authoritarian and anarchist feminist ideas to the broader movement wendy mcelroy has popularized an individualist anarchism take on feminism in her books articles and individualist feminist website anarcho capitalism murray rothbard one nine two six one nine nine five anarcho capitalism is a predominantly united states based theoretical tradition that desires a stateless society with the economic system of free market capitalism unlike other branches of anarchism it does not oppose profit or capitalism consequently most anarchists do not recognise anarcho capitalism as a form of anarchism murray rothbard s synthesis of classical liberalism and austrian economics was germinal for the development of contemporary anarcho capitalist theory he defines anarcho capitalism in terms of the non aggression principle based on the concept of natural law competing theorists use egoism utilitarianism used by david friedman or contractarianism used by jan narveson some minarchists such as ayn rand robert nozick and robert a heinlein have influenced anarcho capitalism some anarcho capitalists along with some right wing libertarian historians such as david hart and ralph raico considered similar philosophies existing prior to rothbard to be anarcho capitalist such as those of gustave de molinari and auberon herbert opponents of anarcho capitalists dispute these claims the place of anarcho capitalism within anarchism and indeed whether it is a form of anarchism at all is highly controversial for more on this debate see anarchism and anarcho capitalism anarchism and the environment since the late one nine seven zero s anarchists in anglophone and european countries have been taking action for the natural environment eco anarchists or green anarchists believe in deep ecology this is a worldview that embraces biodiversity and
sustainability eco anarchists often use direct action against what they see as earth destroying institutions of particular importance is the earth first movement that takes action such as tree sitting another important component is ecofeminism which sees the domination of nature as a metaphor for the domination of women green anarchism also involves a critique of industrial capitalism and for some green anarchists civilization itself primitivism is a predominantly western philosophy that advocates a return to a pre industrial and usually pre agricultural society it develops a critique of industrial civilization in this critique technology and development have alienated people from the natural world this philosophy develops themes present in the political action of the luddites and the writings of jean jacques rousseau primitivism developed in the context of the reclaim the streets earth first and the earth liberation front movements john zerzan wrote that civilization not just the state would need to fall for anarchy to be achieved anarcho primitivists point to the anti authoritarian nature of many primitive or hunter gatherer societies throughout the world s history as examples of anarchist societies other branches and offshoots anarchism generates many eclectic and syncretic philosophies and movements since the western social ferment in the one nine six zero s and one nine seven zero s a number of new movements and schools have appeared most of these stances are limited to even smaller numbers than the schools and movements listed above hakim bey post left anarchy post left anarchy also called egoist anarchism seeks to distance itself from the traditional left communists liberals social democrats etc and to escape the confines of ideology in general post leftists argue that anarchism has been weakened by its long attachment to contrary leftist movements and single issue causes anti war anti nuclear etc it calls for a synthesis of anarchist thought and a
specifically anti authoritarian revolutionary movement outside of the leftist milieu it often focuses on the individual rather than speaking in terms of class or other broad generalizations and shuns organizational tendencies in favor of the complete absence of explicit hierarchy important groups and individuals associated with post left anarchy include crimethinc the magazine anarchy a journal of desire armed and its editor jason mcquinn bob black hakim bey and others for more information see infoshop org s anarchy after leftism section and the post left section on anarchism ws see also post left anarchy post structuralism the term postanarchism was originated by saul newman first receiving popular attention in his book from bakunin to lacan to refer to a theoretical move towards a synthesis of classical anarchist theory and poststructuralist thought subsequent to newman s use of the term however it has taken on a life of its own and a wide range of ideas including autonomism post left anarchy situationism post colonialism and zapatismo by its very nature post anarchism rejects the idea that it should be a coherent set of doctrines and beliefs as such it is difficult if not impossible to state with any degree of certainty who should or shouldn t be grouped under the rubric nonetheless key thinkers associated with post anarchism include saul newman todd may gilles deleuze and f lix guattari external reference postanarchism clearinghouse see also post anarchism insurrectionary anarchism insurrectionary anarchism is a form of revolutionary anarchism critical of formal anarchist labor unions and federations insurrectionary anarchists advocate informal organization including small affinity groups carrying out acts of resistance in various struggles and mass organizations called base structures which can include exploited individuals who are not anarchists proponents include wolfi landstreicher and alfredo m bonanno author of works including armed joy and the anarchist 
tension this tendency is represented in the us in magazines such as willful disobedience and killing king abacus see also insurrectionary anarchism small a anarchism small a anarchism is a term used in two different but not unconnected contexts dave neal posited the term in opposition to big a anarchism in the article anarchism ideology or methodology while big a anarchism referred to ideological anarchists small a anarchism was applied to their methodological counterparts those who viewed anarchism as a way of acting or a historical tendency against illegitimate authority as an anti ideological position small a anarchism shares some similarities with post left anarchy david graeber and andrej grubacic offer an alternative use of the term applying it to groups and movements organising according to or acting in a manner consistent with anarchist principles of decentralisation voluntary association mutual aid the network model and crucially the rejection of any idea that the end justifies the means let alone that the business of a revolutionary is to seize state power and then begin imposing one s vision at the point of a gun other issues conceptions of an anarchist society many political philosophers justify support of the state as a means of regulating violence so that the destruction caused by human conflict is minimized and fair relationships are established anarchists argue that pursuit of these ends does not justify the establishment of a state many argue that the state is incompatible with those goals and the cause of chaos violence and war anarchists argue that the state helps to create a monopoly on violence and uses violence to advance elite interests much effort has been dedicated to explaining how anarchist societies would handle criminality see also anarchism and society civil rights and cultural sovereignty black anarchism opposes the existence of a state capitalism and subjugation and domination of people of color and favors a non hierarchical 
organization of society theorists include ashanti alston lorenzo komboa ervin and sam mbah anarchist people of color was created as a forum for non caucasian anarchists to express their thoughts about racial issues within the anarchist movement particularly within the united states national anarchism is a political view which seeks to unite cultural or ethnic preservation with anarchist views its adherents propose that those preventing ethnic groups or races from living in separate autonomous groupings should be resisted anti racist action is not an anarchist group but many anarchists are involved it focuses on publicly confronting racist agitators the zapatista movement of chiapas mexico is a cultural sovereignty group with some anarchist proclivities neocolonialism and globalization nearly all anarchists oppose neocolonialism as an attempt to use economic coercion on a global scale carried out through state institutions such as the world bank world trade organization group of eight and the world economic forum globalization is an ambiguous term that has different meanings to different anarchist factions most anarchists use the term to mean neocolonialism and or cultural imperialism which they may see as related many are active in the anti globalization movement others particularly anarcho capitalists use globalization to mean the worldwide expansion of the division of labor and trade which they see as beneficial so long as governments do not intervene parallel structures many anarchists try to set up alternatives to state supported institutions and outposts such as food not bombs infoshops educational systems such as home schooling neighborhood mediation arbitration groups and so on the idea is to create the structures for a new anti authoritarian society in the shell of the old authoritarian one technology recent technological developments have made the anarchist cause both easier to advance and more conceivable to people many people use the internet to form on 
line communities intellectual property is undermined and a gift culture supported by sharing music files open source programming and the free software movement these cyber communities include the gnu linux indymedia and wiki some anarchists see information technology as the best weapon to defeat authoritarianism some even think the information age makes eventual anarchy inevitable see also crypto anarchism and cypherpunk pacifism some anarchists consider pacifism opposition to war to be inherent in their philosophy anarcho pacifists take it further and follow leo tolstoy s belief in non violence anarchists see war as an activity in which the state seeks to gain and consolidate power both domestically and in foreign lands and subscribe to randolph bourne s view that war is the health of the state a lot of anarchist activity has been anti war based parliamentarianism in general terms the anarchist ethos opposes voting in elections because voting amounts to condoning the state voluntaryism is an anarchist school of thought which emphasizes tending your own garden and neither ballots nor bullets the anarchist case against voting is explained in the ethics of voting by george h smith also see voting anarchists an oxymoron or what by joe peacott and writings by fred woodworth sectarianism most anarchist schools of thought are to some degree sectarian there is often a difference of opinion within each school about how to react to or interact with other schools some such as panarchists believe that it is possible for a variety of modes of social life to coexist and compete some anarchists view opposing schools as a social impossibility and resist interaction others see opportunities for coalition building or at least temporary alliances for specific purposes see anarchism without adjectives criticisms of anarchism main article criticisms of anarchism violence since anarchism has often been associated with violence and destruction some people have seen it as being too 
violent on the other hand frederick engels criticised anarchists for not being violent enough a revolution is certainly the most authoritarian thing there is it is the act whereby one part of the population imposes its will upon the other part by means of rifles bayonets and cannon authoritarian means if such there be at all and if the victorious party does not want to have fought in vain it must maintain this rule by means of the terror which its arms inspire in the reactionists would the paris commune have lasted a single day if it had not made use of this authority of the armed people against the bourgeois utopianism anarchism is often criticised as unfeasible or plain utopian even by many who agree that it s a nice idea in principle for example carl landauer in his book european socialism criticizes anarchism as being unrealistically utopian and holds that government is a lesser evil than a society without repressive force he holds that the belief that ill intentions will cease if repressive force disappears is an absurdity however it must be noted that not all anarchists have such a utopian view of anarchism for example some such as benjamin tucker advocate privately funded institutions that defend individual liberty and property however other anarchists such as sir herbert read proudly accept the characterization utopian class character marxists have characterised anarchism as an expression of the class interests of the petite bourgeoisie or perhaps the lumpenproletariat see e g plekhanov for a marxist critique of one eight nine five anarchists have also been characterised as spoilt middle class dilettantes most recently in relation to anti capitalist protesters tacit authoritarianism in recent decades anarchism has been criticised by situationists post anarchists and others of preserving tacitly statist authoritarian or bureaucratic tendencies behind a dogmatic facade hypocrisy some critics point to the sexist and racist views of some prominent
anarchists notably proudhon and bakunin as examples of hypocrisy inherent within anarchism while many anarchists however dismiss that the personal prejudices of one nine th century theorists influence the beliefs of present day anarchists others criticise modern anarchism for continuing to be eurocentric and reference the impact of anarchist thinkers like proudhon on fascism through groups like cercle proudhon anarcho capitalist bryan caplan argues that the treatment of fascists and suspected fascist sympathizers by spanish anarchists in the spanish civil war was a form of illegitimate coercion making the professed anarchists ultimately just a third faction of totalitarians alongside the communists and fascists he also criticizes the willingness of the cnt to join the statist republican government during the civil war and references stanley g payne s book on the franco regime which claims that the cnt entered negotiations with the fascist government six years after the war cultural phenomena noam chomsky one nine two eight the kind of anarchism that is most easily encountered in popular culture is represented by celebrities who publicly identify themselves as anarchists although some anarchists reject any focus on such famous living individuals as inherently elitist the following figures are examples of prominent publicly self avowed anarchists the mit professor of linguistics noam chomsky the science fiction author ursula k le guin the social historian howard zinn entertainer and author hans alfredsson the avant garde artist nicolas rossello in denmark the freetown christiania was created in downtown copenhagen the housing and employment crisis in most of western europe led to the formation of communes and squatter movements like the one still thriving in barcelona in catalonia militant resistance to neo nazi groups in places like germany and the uprisings of autonomous marxism situationist and autonomist groups in france and italy also helped to give popularity to
anti authoritarian non capitalist ideas in various musical styles anarchism rose in popularity most famous for the linking of anarchist ideas and music has been punk rock although in the modern age hip hop and folk music are also becoming important mediums for the spreading of the anarchist message in the uk this was associated with the punk rock movement the band crass is celebrated for its anarchist and pacifist ideas the dutch punk band the ex further exemplifies this expression for further details see anarcho punk see also there are many concepts relevant to the topic of anarchism this is a brief summary there is also a more extensive list of anarchist concepts individualist anarchism anarcho communism anarcho syndicalism anarcho capitalism mutualism christian anarchism anarcha feminism green anarchism nihilist anarchism anarcho nationalism black anarchism national anarchism post anarchism post left anarchism libertarian socialism anarchist symbolism list of anarchism links list of anarchists list of anarchist organizations major conflicts within anarchist thought past and present anarchist communities historical events paris commune one eight seven one haymarket riot one eight eight six the makhnovschina one nine one seven one nine two one kronstadt rebellion one nine two one spanish revolution one nine three six see anarchism in spain and spanish revolution may one nine six eight france one nine six eight wto meeting in seattle one nine nine nine books the following is a sample of books that have been referenced in this page a more complete list can be found at the list of anarchist books mikhail bakunin god and the state emma goldman anarchism other essays peter kropotkin mutual aid pierre joseph proudhon what is property rudolf rocker anarcho syndicalism murray rothbard the ethics of liberty max stirner the ego and its own leo tolstoy the kingdom of god is within you anarchism by region culture african anarchism anarchism in spain anarchism in the english 
tradition chinese anarchism references these notes have no corresponding reference in the article they might be re used against politics appleton boston anarchists yarros victor liberty vii january two one eight nine two noam chomsky on anarchism by noam chomsky external links the overwhelming diversity and number of links relating to anarchism is extensively covered on the links subpage anarchoblogs blogs by anarchists anarchy archives extensively archives information relating to famous anarchists this includes many of their books and other publications hundreds of anarchists are listed with short bios links dedicated pages at the daily bleed s anarchist encyclopedia infoshop org wikipedia page industrial workers of the world anarchism forms of government political ideology entry points political theories social philosophy autism is classified as a neurodevelopmental disorder that manifests itself in markedly abnormal social interaction communication ability patterns of interests and patterns of behavior although the specific etiology of autism is unknown many researchers suspect that autism results from genetically mediated vulnerabilities to environmental triggers and while there is disagreement about the magnitude nature and mechanisms for such environmental factors researchers have found at least seven major genes prevalent among individuals diagnosed as autistic some estimate that autism occurs in as many as one united states child in one six six however the national institute of mental health gives a more conservative estimate of one in one zero zero zero for families that already have one autistic child the odds of a second autistic child may be as high as one in twenty diagnosis is based on a list of psychiatric criteria and a series of standardized clinical tests may also be used autism may not be physiologically obvious a complete physical and neurological evaluation will typically be part of diagnosing autism some now speculate that autism is not a 
single condition but a group of several distinct conditions that manifest in similar ways by definition autism must manifest delays in social interaction language as used in social communication or symbolic or imaginative play with onset prior to age three years according to the diagnostic and statistical manual of mental disorders the icd one zero also says that symptoms must manifest before the age of three years there have been large increases in the reported incidence of autism for reasons that are heavily debated by researchers in psychology and related fields within the scientific community some children with autism have improved their social and other skills to the point where they can fully participate in mainstream education and social events but there are lingering concerns that an absolute cure from autism is impossible with current technology however many autistic children and adults who are able to communicate at least in writing are opposed to attempts to cure their conditions and see such conditions as part of who they are history dr hans asperger described a form of autism in the one nine four zero s that later became known as asperger s syndrome the word autism was first used in the english language by swiss psychiatrist eugene bleuler in a one nine one two number of the american journal of insanity it comes from the greek word for self however the classification of autism did not occur until the middle of the twentieth century when in one nine four three psychiatrist dr leo kanner of the johns hopkins hospital in baltimore reported on one one child patients with striking behavioral similarities and introduced the label early infantile autism he suggested autism from the greek autos meaning self to describe the fact that the children seemed to lack interest in other people although kanner s first paper on the subject was published in a now defunct journal the nervous child almost every characteristic he originally described is still regarded as 
typical of the autistic spectrum of disorders at the same time an austrian scientist dr hans asperger described a different form of autism that became known as asperger s syndrome but the widespread recognition of asperger s work was delayed by world war ii in germany and by the fact that his seminal paper wasn t translated into english for almost five zero years the majority of his work wasn t widely read until one nine nine seven thus these two conditions were described and are today listed in the diagnostic and statistical manual of mental disorders dsm iv tr fourth edition text revision one as two of the five pervasive developmental disorders pdd more often referred to today as autism spectrum disorders asd all of these conditions are characterized by varying degrees of difference in communication skills social interactions and restricted repetitive and stereotyped patterns of behavior few clinicians today solely use the dsm iv criteria for determining a diagnosis of autism which are based on the absence or delay of certain developmental milestones many clinicians instead use an alternate means or a combination thereof to more accurately determine a diagnosis terminology when referring to someone diagnosed with autism the term autistic is often used however the term person with autism can be used instead this is referred to as person first terminology the autistic community generally prefers the term autistic for reasons that are fairly controversial this article uses the term autistic see talk page characteristics dr leo kanner introduced the label early infantile autism in one nine four three there is a great diversity in the skills and behaviors of individuals diagnosed as autistic and physicians will often arrive at different conclusions about the appropriate diagnosis much of this is due to the sensory system of an autistic which is quite different from the sensory system of other people since certain stimulations can affect an autistic differently than a 
non autistic and the degree to which the sensory system is affected varies wildly from one autistic person to another nevertheless professionals within pediatric care and development often look for early indicators of autism in order to initiate treatment as early as possible however some people do not believe in treatment for autism either because they do not believe autism is a disorder or because they believe treatment can do more harm than good social development typically developing infants are social beings early in life they do such things as gaze at people turn toward voices grasp a finger and even smile in contrast most autistic children prefer objects to faces and seem to have tremendous difficulty learning to engage in the give and take of everyday human interaction even in the first few months of life many seem indifferent to other people because they avoid eye contact and do not interact with them as often as non autistic children children with autism often appear to prefer being alone to the company of others and may passively accept such things as hugs and cuddling without reciprocating or resist attention altogether later they seldom seek comfort from others or respond to parents displays of anger or affection in a typical way research has suggested that although autistic children are attached to their parents their expression of this attachment is unusual and difficult to interpret parents who looked forward to the joys of cuddling teaching and playing with their child may feel crushed by this lack of expected attachment behavior children with autism appear to lack theory of mind the ability to see things from another person s perspective a behavior cited as exclusive to human beings above the age of five and possibly other higher primates such as adult gorillas chimpanzees and bonobos typical five year olds can develop insights into other people s different knowledge feelings and intentions interpretations based upon social cues e g gestures 
facial expressions an individual with autism seems to lack these interpretation skills an inability that leaves them unable to predict or understand other people s actions the social alienation of autistic and asperger s people is so intense from childhood that many of them have imaginary friends as companionship however having an imaginary friend is not necessarily a sign of autism and also occurs in non autistic children although not universal it is common for autistic people to not regulate their behavior this can take the form of crying or verbal outbursts that may seem out of proportion to the situation individuals with autism generally prefer consistent routines and environments they may react negatively to changes in them it is not uncommon for these individuals to exhibit aggression increased levels of self stimulatory behavior self injury or extensive withdrawal in overwhelming situations sensory system a key indicator to clinicians making a proper assessment for autism would include looking for symptoms much like those found in sensory integration dysfunction children will exhibit problems coping with the normal sensory input indicators of this disorder include oversensitivity or underreactivity to touch movement sights or sounds physical clumsiness or carelessness poor body awareness a tendency to be easily distracted impulsive physical or verbal behavior an activity level that is unusually high or low not unwinding or calming oneself difficulty learning new movements difficulty in making transitions from one situation to another social and or emotional problems delays in speech language or motor skills specific learning difficulties delays in academic achievement one common example is an individual with autism hearing a person with autism may have trouble hearing certain people while other people are louder than usual or the person with autism may be unable to filter out sounds in certain situations such as in a large crowd of people see cocktail party 
effect however this is perhaps the part of the autism that tends to vary the most from person to person so these examples may not apply to every autistic it should be noted that sensory difficulties although reportedly common in autistics are not part of the dsm iv diagnostic criteria for autistic disorder communication difficulties by age three typical children have passed predictable language learning milestones one of the earliest is babbling by the first birthday a typical toddler says words turns when he or she hears his or her name points when he or she wants a toy and when offered something distasteful makes it clear that the answer is no speech development in people with autism takes different paths some remain mute throughout their lives while being fully literate and able to communicate in other ways images sign language and typing are far more natural to them some infants who later show signs of autism coo and babble during the first few months of life but stop soon afterwards others may be delayed developing language as late as the teenage years still inability to speak does not mean that people with autism are unintelligent or unaware once given appropriate accommodations many will happily converse for hours and can often be found in online chat rooms discussion boards or websites and even using communication devices at autism community social events such as autreat those who do speak often use language in unusual ways retaining features of earlier stages of language development for long periods or throughout their lives some speak only single words while others repeat the same phrase over and over some repeat what they hear a condition called echolalia sing song repetitions in particular are a calming joyous activity that many autistic adults engage in many people with autism have a strong tonal sense and can often understand spoken language some children may exhibit only slight delays in language or even seem to have precocious language and unusually 
large vocabularies but have great difficulty in sustaining typical conversations the give and take of non autistic conversation is hard for them although they often carry on a monologue on a favorite subject giving no one else an opportunity to comment when given the chance to converse with other autistics they comfortably do so in parallel monologue taking turns expressing views and information just as neurotypicals people without autism have trouble understanding autistic body languages vocal tones or phraseology people with autism similarly have trouble with such things in people without autism in particular autistic language abilities tend to be highly literal people without autism often inappropriately attribute hidden meaning to what people with autism say or expect the person with autism to sense such unstated meaning in their own words the body language of people with autism can be difficult for other people to understand facial expressions movements and gestures may be easily understood by some other people with autism but do not match those used by other people also their tone of voice has a much more subtle inflection in reflecting their feelings and the auditory system of a person without autism often cannot sense the fluctuations what seems to non autistic people like a high pitched sing song or flat robot like voice is common in autistic children some autistic children with relatively good language skills speak like little adults rather than communicating at their current age level which is one of the things that can lead to problems since non autistic people are often unfamiliar with the autistic body language and since autistic natural language may not tend towards speech autistic people often struggle to let other people know what they need as anybody might do in such a situation they may scream in frustration or resort to grabbing what they want while waiting for non autistic people to learn to communicate with them people with autism do whatever 
they can to get through to them communication difficulties may contribute to autistic people becoming socially anxious or depressed repetitive behaviors although people with autism usually appear physically normal and have good muscle control unusual repetitive motions known as self stimulation or stimming may set them apart these behaviors might be extreme and highly apparent or more subtle some children and older individuals spend a lot of time repeatedly flapping their arms or wiggling their toes others suddenly freeze in position as children they might spend hours lining up their cars and trains in a certain way not using them for pretend play if someone accidentally moves one of these toys the child may be tremendously upset autistic children often need and demand absolute consistency in their environment a slight change in any routine in mealtimes dressing taking a bath or going to school at a certain time and by the same route can be extremely disturbing people with autism sometimes have a persistent intense preoccupation for example the child might be obsessed with learning all about vacuum cleaners train schedules or lighthouses often they show great interest in different languages numbers symbols or science topics repetitive behaviors can also extend into the spoken word as well perseveration of a single word or phrase even for a specific number of times can also become a part of the child s daily routine effects in education children with autism are affected with these symptoms every day these unusual characteristics set them apart from the everyday normal student because they have trouble understanding people s thoughts and feelings they have trouble understanding what their teacher may be telling them they do not understand that facial expressions and vocal variations hold meanings and may misinterpret what emotion their instructor is displaying this inability to fully decipher the world around them makes education stressful teachers need to be aware 
of a student s disorder so that they are able to help the student get the best out of the lessons being taught some students learn better with visual aids as they are better able to understand material presented this way because of this many teachers create visual schedules for their autistic students this allows the student to know what is going on throughout the day so they know what to prepare for and what activity they will be doing next some autistic children have trouble going from one activity to the next so this visual schedule can help to reduce stress research has shown that working in pairs may be beneficial to autistic children autistic students have problems in schools not only with language and communication but with socialization as well they feel self conscious about themselves and many feel that they will always be outcasts by allowing them to work with peers they can make friends which in turn can help them cope with the problems that arise by doing so they can become more integrated into the mainstream environment of the classroom a teacher s aide can also be useful to the student the aide is able to give more elaborate directions that the teacher may not have time to explain to the autistic child the aide can also facilitate the autistic child in such a way as to allow them to stay at a similar level to the rest of the class this allows a partially one on one lesson structure so that the child is still able to stay in a normal classroom but be given the extra help that they need there are many different techniques that teachers can use to assist their students a teacher needs to become familiar with the child s disorder to know what will work best with that particular child every child is going to be different and teachers have to be able to adjust with every one of them students with autism spectrum disorders typically have high levels of anxiety and stress particularly in social environments like school if a student exhibits aggressive or 
explosive behavior it is important for educational teams to recognize the impact of stress and anxiety preparing students for new situations by writing social stories can lower anxiety teaching social and emotional concepts using systematic teaching approaches such as the incredible five point scale or other cognitive behavioral strategies can increase a student s ability to control excessive behavioral reactions dsm definition autism is defined in section two nine nine zero zero of the diagnostic and statistical manual of mental disorders dsm iv as a total of six or more items from one two and three with at least two from one and one each from two and three qualitative impairment in social interaction as manifested by at least two of the following marked impairment in the use of multiple nonverbal behaviors such as eye to eye gaze facial expression body postures and gestures to regulate social interaction failure to develop peer relationships appropriate to developmental level a lack of spontaneous seeking to share enjoyment interests or achievements with other people e g by a lack of showing bringing or pointing out objects of interest lack of social or + anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans culottes of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philosophy is the belief that rulers are unnecessary and should be abolished although there are differing interpretations of what this means anarchism also refers to related social movements that advocate the elimination of authoritarian institutions particularly the state the word anarchy as most anarchists use it does not imply 
chaos nihilism or anomie but rather a harmonious anti authoritarian society in place of what are regarded as authoritarian political structures and coercive economic institutions anarchists advocate social relations based upon voluntary association of autonomous individuals mutual aid and self governance while anarchism is most easily defined by what it is against anarchists also offer positive visions of what they believe to be a truly free society however ideas about how an anarchist society might work vary considerably especially with respect to economics there is also disagreement about how a free society might be brought about origins and predecessors kropotkin and others argue that before recorded history human society was organized on anarchist principles most anthropologists follow kropotkin and engels in believing that hunter gatherer bands were egalitarian and lacked division of labour accumulated wealth or decreed law and had equal access to resources william godwin anarchists including the the anarchy organisation and rothbard find anarchist attitudes in taoism from ancient china kropotkin found similar ideas in stoic zeno of citium according to kropotkin zeno repudiated the omnipotence of the state its intervention and regimentation and proclaimed the sovereignty of the moral law of the individual the anabaptists of one six th century europe are sometimes considered to be religious forerunners of modern anarchism bertrand russell in his history of western philosophy writes that the anabaptists repudiated all law since they held that the good man will be guided at every moment by the holy spirit from this premise they arrive at communism the diggers or true levellers were an early communistic movement during the time of the english civil war and are considered by some as forerunners of modern anarchism in the modern era the first to use the term to mean something other than chaos was louis armand baron de lahontan in his nouveaux voyages dans l am rique 
septentrionale one seven zero three where he described the indigenous american society which had no state laws prisons priests or private property as being in anarchy russell means a libertarian and leader in the american indian movement has repeatedly stated that he is an anarchist and so are all his ancestors in one seven nine three in the thick of the french revolution william godwin published an enquiry concerning political justice although godwin did not use the word anarchism many later anarchists have regarded this book as the first major anarchist text and godwin as the founder of philosophical anarchism but at this point no anarchist movement yet existed and the term anarchiste was known mainly as an insult hurled by the bourgeois girondins at more radical elements in the french revolution the first self labelled anarchist pierre joseph proudhon it is commonly held that it wasn t until pierre joseph proudhon published what is property in one eight four zero that the term anarchist was adopted as a self description it is for this reason that some claim proudhon as the founder of modern anarchist theory in what is property proudhon answers with the famous accusation property is theft in this work he opposed the institution of decreed property propri t where owners have complete rights to use and abuse their property as they wish such as exploiting workers for profit in its place proudhon supported what he called possession individuals can have limited rights to use resources capital and goods in accordance with principles of equality and justice proudhon s vision of anarchy which he called mutualism mutuellisme involved an exchange economy where individuals and groups could trade the products of their labor using labor notes which represented the amount of working time involved in production this would ensure that no one would profit from the labor of others workers could freely join together in co operative workshops an interest free bank would be set up to 
provide everyone with access to the means of production proudhon s ideas were influential within french working class movements and his followers were active in the revolution of one eight four eight in france proudhon s philosophy of property is complex it was developed in a number of works over his lifetime and there are differing interpretations of some of his ideas for more detailed discussion see here max stirner s egoism in his the ego and its own stirner argued that most commonly accepted social institutions including the notion of state property as a right natural rights in general and the very notion of society were mere illusions or ghosts in the mind saying of society that the individuals are its reality he advocated egoism and a form of amoralism in which individuals would unite in associations of egoists only when it was in their self interest to do so for him property simply comes about through might whoever knows how to take to defend the thing to him belongs property and what i have in my power that is my own so long as i assert myself as holder i am the proprietor of the thing stirner never called himself an anarchist he accepted only the label egoist nevertheless his ideas were influential on many individualistically inclined anarchists although interpretations of his thought are diverse american individualist anarchism benjamin tucker in one eight two five josiah warren had participated in a communitarian experiment headed by robert owen called new harmony which failed in a few years amidst much internal conflict warren blamed the community s failure on a lack of individual sovereignty and a lack of private property warren proceeded to organise experimental anarchist communities which respected what he called the sovereignty of the individual at utopia and modern times in one eight three three warren wrote and published the peaceful revolutionist which some have noted to be the first anarchist periodical ever published benjamin tucker says that 
warren was the first man to expound and formulate the doctrine now known as anarchism liberty xiv december one nine zero zero one benjamin tucker became interested in anarchism through meeting josiah warren and william b greene he edited and published liberty from august one eight eight one to april one nine zero eight it is widely considered to be the finest individualist anarchist periodical ever issued in the english language tucker s conception of individualist anarchism incorporated the ideas of a variety of theorists greene s ideas on mutual banking warren s ideas on cost as the limit of price a heterodox variety of labour theory of value proudhon s market anarchism max stirner s egoism and herbert spencer s law of equal freedom tucker strongly supported the individual s right to own the product of his or her labour as private property and believed in a market economy for trading this property he argued that in a truly free market system without the state the abundance of competition would eliminate profits and ensure that all workers received the full value of their labor other one nine th century individualists included lysander spooner stephen pearl andrews and victor yarros the first international mikhail bakunin one eight one four one eight seven six in europe harsh reaction followed the revolutions of one eight four eight twenty years later in one eight six four the international workingmen s association sometimes called the first international united some diverse european revolutionary currents including anarchism due to its genuine links to active workers movements the international became significant from the start karl marx was a leading figure in the international he was elected to every succeeding general council of the association the first objections to marx came from the mutualists who opposed communism and statism shortly after mikhail bakunin and his followers joined in one eight six eight the first international became polarised into two 
camps with marx and bakunin as their respective figureheads the clearest difference between the camps was over strategy the anarchists around bakunin favoured in kropotkin s words direct economical struggle against capitalism without interfering in the political parliamentary agitation at that time marx and his followers focused on parliamentary activity bakunin characterised marx s ideas as authoritarian and predicted that if a marxist party gained power its leaders would end up as bad as the ruling class they had fought against in one eight seven two the conflict climaxed with a final split between the two groups at the hague congress this is often cited as the origin of the conflict between anarchists and marxists from this moment the social democratic and libertarian currents of socialism had distinct organisations including rival internationals anarchist communism peter kropotkin proudhon and bakunin both opposed communism associating it with statism however in the one eight seven zero s many anarchists moved away from bakunin s economic thinking called collectivism and embraced communist concepts communists believed the means of production should be owned collectively and that goods be distributed by need not labor an early anarchist communist was joseph d jacque the first person to describe himself as libertarian unlike proudhon he argued that it is not the product of his or her labor that the worker has a right to but to the satisfaction of his or her needs whatever may be their nature he announced his ideas in his us published journal le libertaire one eight five eight one eight six one peter kropotkin often seen as the most important theorist outlined his economic ideas in the conquest of bread and fields factories and workshops he felt co operation is more beneficial than competition illustrated in nature in mutual aid a factor of evolution one eight nine seven subsequent anarchist communists include emma goldman and alexander berkman many in the 
anarcho syndicalist movements see below saw anarchist communism as their objective isaac puente s one nine three two comunismo libertario was adopted by the spanish cnt as its manifesto for a post revolutionary society some anarchists disliked merging communism with anarchism several individualist anarchists maintained that abolition of private property was not consistent with liberty for example benjamin tucker whilst professing respect for kropotkin and publishing his work described communist anarchism as pseudo anarchism propaganda of the deed johann most was an outspoken advocate of violence anarchists have often been portrayed as dangerous and violent due mainly to a number of high profile violent acts including riots assassinations insurrections and terrorism by some anarchists some revolutionaries of the late one nine th century encouraged acts of political violence such as bombings and the assassinations of heads of state to further anarchism such actions have sometimes been called propaganda by the deed one of the more outspoken advocates of this strategy was johann most who said the existing system will be quickest and most radically overthrown by the annihilation of its exponents therefore massacres of the enemies of the people must be set in motion most s preferred method of terrorism dynamite earned him the moniker dynamost however there is no consensus on the legitimacy or utility of violence in general mikhail bakunin and errico malatesta for example wrote of violence as a necessary and sometimes desirable force in revolutionary settings but at the same time they denounced acts of individual terrorism malatesta in on violence and bakunin when he refuted nechaev other anarchists sometimes identified as pacifist anarchists advocated complete nonviolence leo tolstoy whose philosophy is often viewed as a form of christian anarchism see below was a notable exponent of nonviolent resistance anarchism in the labour movement the red and black flag coming 
from the experience of anarchists in the labour movement is particularly associated with anarcho syndicalism anarcho syndicalism was an early two zero th century working class movement seeking to overthrow capitalism and the state to institute a worker controlled society the movement pursued industrial actions such as general strike as a primary strategy many anarcho syndicalists believed in anarchist communism though not all communists believed in syndicalism after the one eight seven one repression french anarchism reemerged influencing the bourses de travails of autonomous workers groups and trade unions from this movement the conf d ration g n rale du travail general confederation of work cgt was formed in one eight nine five as the first major anarcho syndicalist movement emile pataud and emile pouget s writing for the cgt saw libertarian communism developing from a general strike after one nine one four the cgt moved away from anarcho syndicalism due to the appeal of bolshevism french style syndicalism was a significant movement in europe prior to one nine two one and remained a significant movement in spain until the mid one nine four zero s the industrial workers of the world iww founded in one nine zero five in the us espoused unionism and sought a general strike to usher in a stateless society in one nine two three one zero zero zero zero zero members existed with the support of up to three zero zero zero zero zero though not explicitly anarchist they organized by rank and file democracy embodying a spirit of resistance that has inspired many anglophone syndicalists cnt propaganda from april two zero zero four reads don t let the politicians rule our lives you vote and they decide don t allow it unity action self management spanish anarchist trade union federations were formed in the one eight seven zero s one nine zero zero and one nine one zero the most successful was the confederaci n nacional del trabajo national confederation of labour cnt founded in 
one nine one zero prior to the one nine four zero s the cnt was the major force in spanish working class politics with a membership of one five eight million in one nine three four the cnt played a major role in the spanish civil war see also anarchism in spain syndicalists like ricardo flores mag n were key figures in the mexican revolution latin american anarchism was strongly influenced extending to the zapatista rebellion and the factory occupation movements in argentina in berlin in one nine two two the cnt was joined with the international workers association an anarcho syndicalist successor to the first international contemporary anarcho syndicalism continues as a minor force in many societies much smaller than in the one nine one zero s two zero s and three zero s the largest organised anarchist movement today is in spain in the form of the confederaci n general del trabajo and the cnt the cgt claims a paid up membership of six zero zero zero zero and received over a million votes in spanish syndical elections other active syndicalist movements include the us workers solidarity alliance and the uk solidarity federation the revolutionary industrial unionist industrial workers of the world also exists claiming two zero zero zero paid members contemporary critics of anarcho syndicalism and revolutionary industrial unionism claim that they are workerist and fail to deal with economic life outside work post leftist critics such as bob black claim anarcho syndicalism advocates oppressive social structures such as work and the workplace anarcho syndicalists in general uphold principles of workers solidarity direct action and self management the russian revolution the russian revolution of one nine one seven was a seismic event in the development of anarchism as a movement and as a philosophy anarchists participated alongside the bolsheviks in both february and october revolutions many anarchists initially supporting the bolshevik coup however the bolsheviks soon 
turned against the anarchists and other left wing opposition a conflict which culminated in the one nine one eight kronstadt rebellion anarchists in central russia were imprisoned or driven underground or joined the victorious bolsheviks in ukraine anarchists fought in the civil war against both whites and bolsheviks within the makhnovshchina peasant army led by nestor makhno expelled american anarchists emma goldman and alexander berkman before leaving russia were amongst those agitating in response to bolshevik policy and the suppression of the kronstadt uprising both wrote classic accounts of their experiences in russia aiming to expose the reality of bolshevik control for them bakunin s predictions about the consequences of marxist rule had proved all too true the victory of the bolsheviks in the october revolution and the resulting russian civil war did serious damage to anarchist movements internationally many workers and activists saw bolshevik success as setting an example communist parties grew at the expense of anarchism and other socialist movements in france and the us for example the major syndicalist movements of the cgt and iww began to realign themselves away from anarchism and towards the communist international in paris the dielo truda group of russian anarchist exiles which included nestor makhno concluded that anarchists needed to develop new forms of organisation in response to the structures of bolshevism their one nine two six manifesto known as the organisational platform of the libertarian communists was supported by some communist anarchists though opposed by many others the platform continues to inspire some contemporary anarchist groups who believe in an anarchist movement organised around its principles of theoretical unity tactical unity collective responsibility and federalism platformist groups today include the workers solidarity movement in ireland the uk s anarchist federation and the late north eastern federation of anarchist 
communists in the northeastern united states and bordering canada the fight against fascism spain one nine three six members of the cnt construct armoured cars to fight against the fascists in one of the collectivised factories in the one nine two zero s and one nine three zero s the familiar dynamics of anarchism s conflict with the state were transformed by the rise of fascism in europe in many cases european anarchists faced difficult choices should they join in popular fronts with reformist democrats and soviet led communists against a common fascist enemy luigi fabbri an exile from italian fascism was amongst those arguing that fascism was something different fascism is not just another form of government which like all others uses violence it is the most authoritarian and the most violent form of government imaginable it represents the utmost glorification of the theory and practice of the principle of authority in france where the fascists came close to insurrection in the february one nine three four riots anarchists divided over a united front policy in spain the cnt initially refused to join a popular front electoral alliance and abstention by cnt supporters led to a right wing election victory but in one nine three six the cnt changed its policy and anarchist votes helped bring the popular front back to power months later the ruling class responded with an attempted coup and the spanish civil war one nine three six three nine was underway in response to the army rebellion an anarchist inspired movement of peasants and workers supported by armed militias took control of the major city of barcelona and of large areas of rural spain where they collectivized the land but even before the eventual fascist victory in one nine three nine the anarchists were losing ground in a bitter struggle with the stalinists the cnt leadership often appeared confused and divided with some members controversially entering the government stalinist led troops suppressed the 
collectives and persecuted both dissident marxists and anarchists since the late one nine seven zero s anarchists have been involved in fighting the rise of neo fascist groups in germany and the united kingdom some anarchists worked within militant anti fascist groups alongside members of the marxist left they advocated directly combating fascists with physical force rather than relying on the state since the late one nine nine zero s a similar tendency has developed within us anarchism see also anti racist action us anti fascist action uk antifa religious anarchism leo tolstoy one eight two eight one nine one zero most anarchist culture tends to be secular if not outright anti religious however the combination of religious social conscience historical religiousity amongst oppressed social classes and the compatibility of some interpretations of religious traditions with anarchism has resulted in religious anarchism christian anarchists believe that there is no higher authority than god and oppose earthly authority such as government and established churches they believe that jesus teachings were clearly anarchistic but were corrupted when christianity was declared the official religion of rome christian anarchists who follow jesus directive to turn the other cheek are strict pacifists the most famous advocate of christian anarchism was leo tolstoy author of the kingdom of god is within you who called for a society based on compassion nonviolent principles and freedom christian anarchists tend to form experimental communities they also occasionally resist taxation many christian anarchists are vegetarian or vegan christian anarchy can be said to have roots as old as the religion s birth as the early church exhibits many anarchistic tendencies such as communal goods and wealth by aiming to obey utterly certain of the bible s teachings certain anabaptist groups of sixteenth century europe attempted to emulate the early church s social economic organisation and 
philosophy by regarding it as the only social structure capable of true obedience to jesus teachings and utterly rejected in theory all earthly hierarchies and authority and indeed non anabaptists in general and violence as ungodly such groups for example the hutterites typically went from initially anarchistic beginnings to as their movements stabilised more authoritarian social models chinese anarchism was most influential in the one nine two zero s strands of chinese anarchism included tai xu s buddhist anarchism which was influenced by tolstoy and the well field system neopaganism with its focus on the environment and equality along with its often decentralized nature has led to a number of neopagan anarchists one of the most prominent is starhawk who writes extensively about both spirituality and activism anarchism and feminism emma goldman early french feminists such as jenny d h ricourt and juliette adam criticised the misogyny in the anarchism of proudhon during the one eight five zero s anarcha feminism is a kind of radical feminism that espouses the belief that patriarchy is a fundamental problem in society while anarchist feminism has existed for more than a hundred years its explicit formulation as anarcha feminism dates back to the early seven zero s during the second wave feminist movement anarcha feminism views patriarchy as the first manifestation of hierarchy in human history thus the first form of oppression occurred in the dominance of male over female anarcha feminists then conclude that if feminists are against patriarchy they must also be against all forms of hierarchy and therefore must reject the authoritarian nature of the state and capitalism anarcho primitivists see the creation of gender roles and patriarchy a creation of the start of civilization and therefore consider primitivism to also be an anarchist school of thought that addresses feminist concerns eco feminism is often considered a feminist variant of green anarchist feminist 
thought anarcha feminism is most often associated with early two zero th century authors and theorists such as emma goldman and voltairine de cleyre although even early first wave feminist mary wollstonecraft held proto anarchist views and william godwin is often considered a feminist anarchist precursor it should be noted that goldman and de cleyre though they both opposed the state had opposing philosophies as de cleyre explains miss goldman is a communist i am an individualist she wishes to destroy the right of property i wish to assert it i make my war upon privilege and authority whereby the right of property the true right in that which is proper to the individual is annihilated she believes that co operation would entirely supplant competition i hold that competition in one form or another will always exist and that it is highly desirable it should in the spanish civil war an anarcha feminist group free women organized to defend both anarchist and feminist ideas in the modern day anarchist movement many anarchists male or female consider themselves feminists and anarcha feminist ideas are growing the publishing of quiet rumors an anarcha feminist reader has helped to spread various kinds of anti authoritarian and anarchist feminist ideas to the broader movement wendy mcelroy has popularized an individualist anarchism take on feminism in her books articles and individualist feminist website anarcho capitalism murray rothbard one nine two six one nine nine five anarcho capitalism is a predominantly united states based theoretical tradition that desires a stateless society with the economic system of free market capitalism unlike other branches of anarchism it does not oppose profit or capitalism consequently most anarchists do not recognise anarcho capitalism as a form of anarchism murray rothbard s synthesis of classical liberalism and austrian economics was germinal for the development of contemporary anarcho capitalist theory he defines anarcho capitalism 
in terms of the non aggression principle based on the concept of natural law competing theorists use egoism utilitarianism used by david friedman or contractarianism used by jan narveson some minarchists such as ayn rand robert nozick and robert a heinlein have influenced anarcho capitalism some anarcho capitalists along with some right wing libertarian historians such as david hart and ralph raico considered similar philosophies existing prior to rothbard to be anarcho capitalist such as those of gustave de molinari and auberon herbert opponents of anarcho capitalists dispute these claims the place of anarcho capitalism within anarchism and indeed whether it is a form of anarchism at all is highly controversial for more on this debate see anarchism and anarcho capitalism anarchism and the environment since the late one nine seven zero s anarchists in anglophone and european countries have been taking action for the natural environment eco anarchists or green anarchists believe in deep ecology this is a worldview that embraces biodiversity and sustainability eco anarchists often use direct action against what they see as earth destroying institutions of particular importance is the earth first movement that takes action such as tree sitting another important component is ecofeminism which sees the domination of nature as a metaphor for the domination of women green anarchism also involves a critique of industrial capitalism and for some green anarchists civilization itself primitivism is a predominantly western philosophy that advocates a return to a pre industrial and usually pre agricultural society it develops a critique of industrial civilization in this critique technology and development have alienated people from the natural world this philosophy develops themes present in the political action of the luddites and the writings of jean jacques rousseau primitivism developed in the context of the reclaim the streets earth first and the earth liberation front 
movements john zerzan wrote that civilization not just the state would need to fall for anarchy to be achieved anarcho primitivists point to the anti authoritarian nature of many primitive or hunter gatherer societies throughout the world s history as examples of anarchist societies other branches and offshoots anarchism generates many eclectic and syncretic philosophies and movements since the western social ferment in the one nine six zero s and one nine seven zero s a number of new movements and schools have appeared most of these stances are limited to even smaller numbers than the schools and movements listed above hakim bey post left anarchy post left anarchy also called egoist anarchism seeks to distance itself from the traditional left communists liberals social democrats etc and to escape the confines of ideology in general post leftists argue that anarchism has been weakened by its long attachment to contrary leftist movements and single issue causes anti war anti nuclear etc it calls for a synthesis of anarchist thought and a specifically anti authoritarian revolutionary movement outside of the leftist milieu it often focuses on the individual rather than speaking in terms of class or other broad generalizations and shuns organizational tendencies in favor of the complete absence of explicit hierarchy important groups and individuals associated with post left anarchy include crimethinc the magazine anarchy a journal of desire armed and its editor jason mcquinn bob black hakim bey and others for more information see infoshop org s anarchy after leftism section and the post left section on anarchism ws see also post left anarchy post structuralism the term postanarchism was originated by saul newman first receiving popular attention in his book from bakunin to lacan to refer to a theoretical move towards a synthesis of classical anarchist theory and poststructuralist thought subsequent to newman s use of the term however it has taken on a life of its own 
and a wide range of ideas including autonomism post left anarchy situationism post colonialism and zapatismo by its very nature post anarchism rejects the idea that it should be a coherent set of doctrines and beliefs as such it is difficult if not impossible to state with any degree of certainty who should or shouldn t be grouped under the rubric nonetheless key thinkers associated with post anarchism include saul newman todd may gilles deleuze and f lix guattari external reference postanarchism clearinghouse see also post anarchism insurrectionary anarchism insurrectionary anarchism is a form of revolutionary anarchism critical of formal anarchist labor unions and federations insurrectionary anarchists advocate informal organization including small affinity groups carrying out acts of resistance in various struggles and mass organizations called base structures which can include exploited individuals who are not anarchists proponents include wolfi landstreicher and alfredo m bonanno author of works including armed joy and the anarchist tension this tendency is represented in the us in magazines such as willful disobedience and killing king abacus see also insurrectionary anarchism small a anarchism small a anarchism is a term used in two different but not unconnected contexts dave neal posited the term in opposition to big a anarchism in the article anarchism ideology or methodology while big a anarchism referred to ideological anarchists small a anarchism was applied to their methodological counterparts those who viewed anarchism as a way of acting or a historical tendency against illegitimate authority as an anti ideological position small a anarchism shares some similarities with post left anarchy david graeber and andrej grubacic offer an alternative use of the term applying it to groups and movements organising according to or acting in a manner consistent with anarchist principles of decentralisation voluntary association mutual aid the network model and 
crucially the rejection of any idea that the end justifies the means let alone that the business of a revolutionary is to seize state power and then begin imposing one s vision at the point of a gun other issues conceptions of an anarchist society many political philosophers justify support of the state as a means of regulating violence so that the destruction caused by human conflict is minimized and fair relationships are established anarchists argue that pursuit of these ends does not justify the establishment of a state many argue that the state is incompatible with those goals and the cause of chaos violence and war anarchists argue that the state helps to create a monopoly on violence and uses violence to advance elite interests much effort has been dedicated to explaining how anarchist societies would handle criminality see also anarchism and society civil rights and cultural sovereignty black anarchism opposes the existence of a state capitalism and subjugation and domination of people of color and favors a non hierarchical organization of society theorists include ashanti alston lorenzo komboa ervin and sam mbah anarchist people of color was created as a forum for non caucasian anarchists to express their thoughts about racial issues within the anarchist movement particularly within the united states national anarchism is a political view which seeks to unite cultural or ethnic preservation with anarchist views its adherents propose that those preventing ethnic groups or races from living in separate autonomous groupings should be resisted anti racist action is not an anarchist group but many anarchists are involved it focuses on publicly confronting racist agitators the zapatista movement of chiapas mexico is a cultural sovereignty group with some anarchist proclivities neocolonialism and globalization nearly all anarchists oppose neocolonialism as an attempt to use economic coercion on a global scale carried out through state institutions such as the 
world bank world trade organization group of eight and the world economic forum globalization is an ambiguous term that has different meanings to different anarchist factions most anarchists use the term to mean neocolonialism and or cultural imperialism which they may see as related many are active in the anti globalization movement others particularly anarcho capitalists use globalization to mean the worldwide expansion of the division of labor and trade which they see as beneficial so long as governments do not intervene parallel structures many anarchists try to set up alternatives to state supported institutions and outposts such as food not bombs infoshops educational systems such as home schooling neighborhood mediation arbitration groups and so on the idea is to create the structures for a new anti authoritarian society in the shell of the old authoritarian one technology recent technological developments have made the anarchist cause both easier to advance and more conceivable to people many people use the internet to form on line communities intellectual property is undermined and a gift culture supported by sharing music files open source programming and the free software movement these cyber communities include the gnu linux indymedia and wiki some anarchists see information technology as the best weapon to defeat authoritarianism some even think the information age makes eventual anarchy inevitable see also crypto anarchism and cypherpunk pacifism some anarchists consider pacifism opposition to war to be inherent in their philosophy anarcho pacifists take it further and follow leo tolstoy s belief in non violence anarchists see war as an activity in which the state seeks to gain and consolidate power both domestically and in foreign lands and subscribe to randolph bourne s view that war is the health of the state a lot of anarchist activity has been anti war based parliamentarianism in general terms the anarchist ethos opposes voting in elections 
because voting amounts to condoning the state voluntaryism is an anarchist school of thought which emphasizes tending your own garden and neither ballots nor bullets the anarchist case against voting is explained in the ethics of voting by george h smith also see voting anarchists an oxymoron or what by joe peacott and writings by fred woodworth sectarianism most anarchist schools of thought are to some degree sectarian there is often a difference of opinion within each school about how to react to or interact with other schools some such as panarchists believe that it is possible for a variety of modes of social life to coexist and compete some anarchists view opposing schools as a social impossibility and resist interaction others see opportunities for coalition building or at least temporary alliances for specific purposes see anarchism without adjectives criticisms of anarchism main article criticisms of anarchism violence since anarchism has often been associated with violence and destruction some people have seen it as being too violent on the other hand frederick engels criticised anarchists for not being violent enough a revolution is certainly the most authoritarian thing there is it is the act whereby one part of the population imposes its will upon the other part by means of rifles bayonets and cannon authoritarian means if such there be at all and if the victorious party does not want to have fought in vain it must maintain this rule by means of the terror which its arms inspire in the reactionists would the paris commune have lasted a single day if it had not made use of this authority of the armed people against the bourgeois utopianism anarchism is often criticised as unfeasible or plain utopian even by many who agree that it s a nice idea in principle for example carl landauer in his book european socialism criticizes anarchism as being unrealistically utopian and holds that government is a lesser evil than a society without repressive force he 
holds that the belief that ill intentions will cease if repressive force disappears is an absurdity however it must be noted that not all anarchists have such a utopian view of anarchism for example some such as benjamin tucker advocate privately funded institutions that defend individual liberty and property however other anarchists such as sir herbert read proudly accept the characterization utopian class character marxists have characterised anarchism as an expression of the class interests of the petite bourgeoisie or perhaps the lumpenproletariat see e g plekhanov for a marxist critique of one eight nine five anarchists have also been characterised as spoilt middle class dilettantes most recently in relation to anti capitalist protesters tacit authoritarianism in recent decades anarchism has been criticised by situationists post anarchists and others of preserving tacitly statist authoritarian or bureaucratic tendencies behind a dogmatic facade hypocrisy some critics point to the sexist and racist views of some prominent anarchists notably proudhon and bakunin as examples of hypocrisy inherent within anarchism while many anarchists however dismiss that the personal prejudices of one nine th century theorists influence the beliefs of present day anarchists others criticise modern anarchism for continuing to be eurocentric and reference the impact of anarchist thinkers like proudhon on fascism through groups like cercle proudhon anarcho capitalist bryan caplan argues that the treatment of fascists and suspected fascist sympathizers by spanish anarchists in the spanish civil war was a form of illegitimate coercion making the professed anarchists ultimately just a third faction of totalitarians alongside the communists and fascists he also criticizes the willingness of the cnt to join the statist republican government during the civil war and references stanley g payne s book on the franco regime which claims that the cnt entered negotiations with the fascist 
government six years after the war cultural phenomena noam chomsky one nine two eight the kind of anarchism that is most easily encountered in popular culture is represented by celebrities who publicly identify themselves as anarchists although some anarchists reject any focus on such famous living individuals as inherently litist the following figures are examples of prominent publicly self avowed anarchists the mit professor of linguistics noam chomsky the science fiction author ursula k le guin the social historian howard zinn entertainer and author hans alfredsson the avant garde artist nicol s rossell in denmark the freetown christiania was created in downtown copenhagen the housing and employment crisis in most of western europe led to the formation of communes and squatter movements like the one still thriving in barcelona in catalonia militant resistance to neo nazi groups in places like germany and the uprisings of autonomous marxism situationist and autonomist groups in france and italy also helped to give popularity to anti authoritarian non capitalist ideas in various musical styles anarchism rose in popularity most famous for the linking of anarchist ideas and music has been punk rock although in the modern age hip hop and folk music are also becoming important mediums for the spreading of the anarchist message in the uk this was associated with the punk rock movement the band crass is celebrated for its anarchist and pacifist ideas the dutch punk band the ex further exemplifies this expression for further details see anarcho punk see also there are many concepts relevant to the topic of anarchism this is a brief summary there is also a more extensive list of anarchist concepts individualist anarchism anarcho communism anarcho syndicalism anarcho capitalism mutualism christian anarchism anarcha feminism green anarchism nihilist anarchism anarcho nationalism black anarchism national anarchism post anarchism post left anarchism libertarian socialism 
anarchist symbolism list of anarchism links list of anarchists list of anarchist organizations major conflicts within anarchist thought past and present anarchist communities historical events paris commune one eight seven one haymarket riot one eight eight six the makhnovschina one nine one seven one nine two one kronstadt rebellion one nine two one spanish revolution one nine three six see anarchism in spain and spanish revolution may one nine six eight france one nine six eight wto meeting in seattle one nine nine nine books the following is a sample of books that have been referenced in this page a more complete list can be found at the list of anarchist books mikhail bakunin god and the state emma goldman anarchism other essays peter kropotkin mutual aid pierre joseph proudhon what is property rudolf rocker anarcho syndicalism murray rothbard the ethics of liberty max stirner the ego and its own leo tolstoy the kingdom of god is within you anarchism by region culture african anarchism anarchism in spain anarchism in the english tradition chinese anarchism references these notes have no corresponding reference in the article they might be re used against politics appleton boston anarchists yarros victor liberty vii january two one eight nine two noam chomsky on anarchism by noam chomsky external links the overwhelming diversity and number of links relating to anarchism is extensively covered on the links subpage anarchoblogs blogs by anarchists anarchy archives extensively archives information relating to famous anarchists this includes many of their books and other publications hundreds of anarchists are listed with short bios links dedicated pages at the daily bleed s anarchist encyclopedia infoshop org wikipedia page industrial workers of the world anarchism forms of government political ideology entry points political theories social philosophy autism is classified as a neurodevelopmental disorder that manifests itself in markedly abnormal social 
interaction communication ability patterns of interests and patterns of behavior although the specific etiology of autism is unknown many researchers suspect that autism results from genetically mediated vulnerabilities to environmental triggers and while there is disagreement about the magnitude nature and mechanisms for such environmental factors researchers have found at least seven major genes prevalent among individuals diagnosed as autistic some estimate that autism occurs in as many as one united states child in one six six however the national institute of mental health gives a more conservative estimate of one in one zero zero zero for families that already have one autistic child the odds of a second autistic child may be as high as one in twenty diagnosis is based on a list of psychiatric criteria and a series of standardized clinical tests may also be used autism may not be physiologically obvious a complete physical and neurological evaluation will typically be part of diagnosing autism some now speculate that autism is not a single condition but a group of several distinct conditions that manifest in similar ways by definition autism must manifest delays in social interaction language as used in social communication or symbolic or imaginative play with onset prior to age three years according to the diagnostic and statistical manual of mental disorders the icd one zero also says that symptoms must manifest before the age of three years there have been large increases in the reported incidence of autism for reasons that are heavily debated by researchers in psychology and related fields within the scientific community some children with autism have improved their social and other skills to the point where they can fully participate in mainstream education and social events but there are lingering concerns that an absolute cure from autism is impossible with current technology however many autistic children and adults who are able to communicate at 
least in writing are opposed to attempts to cure their conditions and see such conditions as part of who they are history dr hans asperger described a form of autism in the one nine four zero s that later became known as asperger s syndrome the word autism was first used in the english language by swiss psychiatrist eugene bleuler in a one nine one two number of the american journal of insanity it comes from the greek word for self however the classification of autism did not occur until the middle of the twentieth century when in one nine four three psychiatrist dr leo kanner of the johns hopkins hospital in baltimore reported on one one child patients with striking behavioral similarities and introduced the label early infantile autism he suggested autism from the greek autos meaning self to describe the fact that the children seemed to lack interest in other people although kanner s first paper on the subject was published in a now defunct journal the nervous child almost every characteristic he originally described is still regarded as typical of the autistic spectrum of disorders at the same time an austrian scientist dr hans asperger described a different form of autism that became known as asperger s syndrome but the widespread recognition of asperger s work was delayed by world war ii in germany and by the fact that his seminal paper wasn t translated into english for almost five zero years the majority of his work wasn t widely read until one nine nine seven thus these two conditions were described and are today listed in the diagnostic and statistical manual of mental disorders dsm iv tr fourth edition text revision one as two of the five pervasive developmental disorders pdd more often referred to today as autism spectrum disorders asd all of these conditions are characterized by varying degrees of difference in communication skills social interactions and restricted repetitive and stereotyped patterns of behavior few clinicians today solely use the dsm 
iv criteria for determining a diagnosis of autism which are based on the absence or delay of certain developmental milestones many clinicians instead use an alternate means or a combination thereof to more accurately determine a diagnosis terminology when referring to someone diagnosed with autism the term autistic is often used however the term person with autism can be used instead this is referred to as person first terminology the autistic community generally prefers the term autistic for reasons that are fairly controversial this article uses the term autistic see talk page characteristics dr leo kanner introduced the label early infantile autism in one nine four three there is a great diversity in the skills and behaviors of individuals diagnosed as autistic and physicians will often arrive at different conclusions about the appropriate diagnosis much of this is due to the sensory system of an autistic which is quite different from the sensory system of other people since certain stimulations can affect an autistic differently than a non autistic and the degree to which the sensory system is affected varies wildly from one autistic person to another nevertheless professionals within pediatric care and development often look for early indicators of autism in order to initiate treatment as early as possible however some people do not believe in treatment for autism either because they do not believe autism is a disorder or because they believe treatment can do more harm than good social development typically developing infants are social beings early in life they do such things as gaze at people turn toward voices grasp a finger and even smile in contrast most autistic children prefer objects to faces and seem to have tremendous difficulty learning to engage in the give and take of everyday human interaction even in the first few months of life many seem indifferent to other people because they avoid eye contact and do not interact with them as often as non 
autistic children children with autism often appear to prefer being alone to the company of others and may passively accept such things as hugs and cuddling without reciprocating or resist attention altogether later they seldom seek comfort from others or respond to parents displays of anger or affection in a typical way research has suggested that although autistic children are attached to their parents their expression of this attachment is unusual and difficult to interpret parents who looked forward to the joys of cuddling teaching and playing with their child may feel crushed by this lack of expected attachment behavior children with autism appear to lack theory of mind the ability to see things from another person s perspective a behavior cited as exclusive to human beings above the age of five and possibly other higher primates such as adult gorillas chimpanzees and bonobos typical five year olds can develop insights into other people s different knowledge feelings and intentions interpretations based upon social cues e g gestures facial expressions an individual with autism seems to lack these interpretation skills an inability that leaves them unable to predict or understand other people s actions the social alienation of autistic and asperger s people is so intense from childhood that many of them have imaginary friends as companionship however having an imaginary friend is not necessarily a sign of autism and also occurs in non autistic children although not universal it is common for autistic people to not regulate their behavior this can take the form of crying or verbal outbursts that may seem out of proportion to the situation individuals with autism generally prefer consistent routines and environments they may react negatively to changes in them it is not uncommon for these individuals to exhibit aggression increased levels of self stimulatory behavior self injury or extensive withdrawal in overwhelming situations sensory system a key indicator to 
clinicians making a proper assessment for autism would include looking for symptoms much like those found in sensory integration dysfunction children will exhibit problems coping with the normal sensory input indicators of this disorder include oversensitivity or underreactivity to touch movement sights or sounds physical clumsiness or carelessness poor body awareness a tendency to be easily distracted impulsive physical or verbal behavior an activity level that is unusually high or low not unwinding or calming oneself difficulty learning new movements difficulty in making transitions from one situation to another social and or emotional problems delays in speech language or motor skills specific learning difficulties delays in academic achievement one common example is an individual with autism hearing a person with autism may have trouble hearing certain people while other people are louder than usual or the person with autism may be unable to filter out sounds in certain situations such as in a large crowd of people see cocktail party effect however this is perhaps the part of the autism that tends to vary the most from person to person so these examples may not apply to every autistic it should be noted that sensory difficulties although reportedly common in autistics are not part of the dsm iv diagnostic criteria for autistic disorder communication difficulties by age three typical children have passed predictable language learning milestones one of the earliest is babbling by the first birthday a typical toddler says words turns when he or she hears his or her name points when he or she wants a toy and when offered something distasteful makes it clear that the answer is no speech development in people with autism takes different paths some remain mute throughout their lives while being fully literate and able to communicate in other ways images sign language and typing are far more natural to them some infants who later show signs of autism coo and babble 
during the first few months of life but stop soon afterwards others may be delayed developing language as late as the teenage years still inability to speak does not mean that people with autism are unintelligent or unaware once given appropriate accommodations many will happily converse for hours and can often be found in online chat rooms discussion boards or websites and even using communication devices at autism community social events such as autreat those who do speak often use language in unusual ways retaining features of earlier stages of language development for long periods or throughout their lives some speak only single words while others repeat the same phrase over and over some repeat what they hear a condition called echolalia sing song repetitions in particular are a calming joyous activity that many autistic adults engage in many people with autism have a strong tonal sense and can often understand spoken language some children may exhibit only slight delays in language or even seem to have precocious language and unusually large vocabularies but have great difficulty in sustaining typical conversations the give and take of non autistic conversation is hard for them although they often carry on a monologue on a favorite subject giving no one else an opportunity to comment when given the chance to converse with other autistics they comfortably do so in parallel monologue taking turns expressing views and information just as neurotypicals people without autism have trouble understanding autistic body languages vocal tones or phraseology people with autism similarly have trouble with such things in people without autism in particular autistic language abilities tend to be highly literal people without autism often inappropriately attribute hidden meaning to what people with autism say or expect the person with autism to sense such unstated meaning in their own words the body language of people with autism can be difficult for other people to 
understand facial expressions movements and gestures may be easily understood by some other people with autism but do not match those used by other people also their tone of voice has a much more subtle inflection in reflecting their feelings and the auditory system of a person without autism often cannot sense the fluctuations what seems to non autistic people like a high pitched sing song or flat robot like voice is common in autistic children some autistic children with relatively good language skills speak like little adults rather than communicating at their current age level which is one of the things that can lead to problems since non autistic people are often unfamiliar with the autistic body language and since autistic natural language may not tend towards speech autistic people often struggle to let other people know what they need as anybody might do in such a situation they may scream in frustration or resort to grabbing what they want while waiting for non autistic people to learn to communicate with them people with autism do whatever they can to get through to them communication difficulties may contribute to autistic people becoming socially anxious or depressed repetitive behaviors although people with autism usually appear physically normal and have good muscle control unusual repetitive motions known as self stimulation or stimming may set them apart these behaviors might be extreme and highly apparent or more subtle some children and older individuals spend a lot of time repeatedly flapping their arms or wiggling their toes others suddenly freeze in position as children they might spend hours lining up their cars and trains in a certain way not using them for pretend play if someone accidentally moves one of these toys the child may be tremendously upset autistic children often need and demand absolute consistency in their environment a slight change in any routine in mealtimes dressing taking a bath or going to school at a certain time and by 
the same route can be extremely disturbing people with autism sometimes have a persistent intense preoccupation for example the child might be obsessed with learning all about vacuum cleaners train schedules or lighthouses often they show great interest in different languages numbers symbols or science topics repetitive behaviors can also extend into the spoken word as well perseveration of a single word or phrase even for a specific number of times can also become a part of the child s daily routine effects in education children with autism are affected with these symptoms every day these unusual characteristics set them apart from the everyday normal student because they have trouble understanding people s thoughts and feelings they have trouble understanding what their teacher may be telling them they do not understand that facial expressions and vocal variations hold meanings and may misinterpret what emotion their instructor is displaying this inability to fully decipher the world around them makes education stressful teachers need to be aware of a student s disorder so that they are able to help the student get the best out of the lessons being taught some students learn better with visual aids as they are better able to understand material presented this way because of this many teachers create visual schedules for their autistic students this allows the student to know what is going on throughout the day so they know what to prepare for and what activity they will be doing next some autistic children have trouble going from one activity to the next so this visual schedule can help to reduce stress research has shown that working in pairs may be beneficial to autistic children autistic students have problems in schools not only with language and communication but with socialization as well they feel self conscious about themselves and many feel that they will always be outcasts by allowing them to work with peers they can make friends which in turn can help 
them cope with the problems that arise by doing so they can become more integrated into the mainstream environment of the classroom a teacher s aide can also be useful to the student the aide is able to give more elaborate directions that the teacher may not have time to explain to the autistic child the aide can also facilitate the autistic child in such a way as to allow them to stay at a similar level to the rest of the class this allows a partially one on one lesson structure so that the child is still able to stay in a normal classroom but be given the extra help that they need there are many different techniques that teachers can use to assist their students a teacher needs to become familiar with the child s disorder to know what will work best with that particular child every child is going to be different and teachers have to be able to adjust with every one of them students with autism spectrum disorders typically have high levels of anxiety and stress particularly in social environments like school if a student exhibits aggressive or explosive behavior it is important for educational teams to recognize the impact of stress and anxiety preparing students for new situations by writing social stories can lower anxiety teaching social and emotional concepts using systematic teaching approaches such as the incredible five point scale or other cognitive behavioral strategies can increase a student s ability to control excessive behavioral reactions dsm definition autism is defined in section two nine nine zero zero of the diagnostic and statistical manual of mental disorders dsm iv as a total of six or more items from one two and three with at least two from one and one each from two and three qualitative impairment in social interaction as manifested by at least two of the following marked impairment in the use of multiple nonverbal behaviors such as eye to eye gaze facial expression body postures and gestures to regulate social interaction failure to develop 
peer relationships appropriate to developmental level a lack of spontaneous seeking to share enjoyment interests or achievements with other people e g by a lack of showing bringing or pointing out objects of interest lack of social or Addd unicode text: Hello world, Καλημέρα κόσμε, コンニチハ \ No newline at end of file diff --git a/test/skipgram_test.py b/test/skipgram_test.py index 92c02cb..477ecd0 100644 --- a/test/skipgram_test.py +++ b/test/skipgram_test.py @@ -1,3 +1,6 @@ +# Set encoding to support Python 2 +# -*- coding: utf-8 -*- + import unittest from os import path @@ -22,7 +25,7 @@ def test_load_skipgram_model(self): self.assertEqual(model.dim, 100) self.assertEqual(model.ws, 5) self.assertEqual(model.epoch, 1) - self.assertEqual(model.min_count, 5) + self.assertEqual(model.min_count, 1) self.assertEqual(model.neg, 5) self.assertEqual(model.loss_name, 'ns') self.assertEqual(model.bucket, 2000000) @@ -34,13 +37,18 @@ def test_load_skipgram_model(self): # Make sure the vector have the right dimension self.assertEqual(len(model.get_vector('the')), model.dim) + # Make sure we support unicode character + exists = u'Καλημέρα' in model.words + self.assertTrue(exists) + self.assertEqual(len(model.get_vector(u'Καλημέρα')), model.dim) + def test_create_skipgram_model(self): # set params lr=0.005 dim=10 ws=5 epoch=5 - min_count=5 + min_count=1 neg=5 word_ngrams=1 loss='ns' @@ -77,5 +85,10 @@ def test_create_skipgram_model(self): # Make sure the vector have the right dimension self.assertEqual(len(model.get_vector('the')), dim) + # Make sure we support unicode character + exists = u'Καλημέρα' in model.words + self.assertTrue(exists) + self.assertEqual(len(model.get_vector(u'Καλημέρα')), model.dim) + if __name__ == '__main__': unittest.main() From 1ab3592157195201d23efb88d3241ecdf868d6ec Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Fri, 12 Aug 2016 08:37:20 +0700 Subject: [PATCH 004/109] Migrate to salestock --- README.md | 2 +- setup.py | 2 +- 2 files changed, 
2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 85f1327..56e54bd 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# fasttext [![Build Status](https://travis-ci.org/pyk/fastText.py.svg?branch=master)](https://travis-ci.org/pyk/fastText.py) +# fasttext [![Build Status](https://travis-ci.org/salestock/fastText.py.svg?branch=master)](https://travis-ci.org/salestock/fastText.py) fasttext is a Python interface for [Facebook fastText](https://github.com/facebookresearch/fastText). diff --git a/setup.py b/setup.py index c2a4856..8336234 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ # Package details setup( name='fasttext', - version='0.5.17', + version='0.5.18', author='Bayu Aldi Yansyah', author_email='bayualdiyansyah@gmail.com', url='https://github.com/pyk/fastText.py', From 86db92999dce212875e20de088db7b2e20b9f859 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Fri, 12 Aug 2016 08:51:35 +0700 Subject: [PATCH 005/109] Add unicode test to CBOW model --- Makefile | 2 +- test/cbow_test.py | 23 ++++++++++++++++++----- test/skipgram_test.py | 11 ++++++----- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index d25ed7e..5f3cdaa 100644 --- a/Makefile +++ b/Makefile @@ -33,7 +33,7 @@ test-skipgram: fasttext/cpp/fasttext test/skipgram_params_test.bin test/cbow_params_test.bin: ./fasttext/cpp/fasttext cbow -input test/params_test.txt -output \ test/cbow_params_test -lr 0.005 -dim 50 -ws 5 -epoch 1 \ - -minCount 3 -neg 5 -loss ns -bucket 2000000 -minn 3 -maxn 6 \ + -minCount 1 -neg 5 -loss ns -bucket 2000000 -minn 3 -maxn 6 \ -thread 4 -lrUpdateRate 100 -t 1e-4 test-cbow: fasttext/cpp/fasttext test/cbow_params_test.bin diff --git a/test/cbow_test.py b/test/cbow_test.py index a10d36d..b884697 100644 --- a/test/cbow_test.py +++ b/test/cbow_test.py @@ -1,3 +1,5 @@ +# Set encoding to support Python 2 +# -*- coding: utf-8 -*- import unittest from os import path @@ -8,9 +10,8 @@ input_file = 
path.join(path.dirname(__file__), 'params_test.txt') output = path.join(path.dirname(__file__), 'generated_cbow') -# Test to make sure that binary file generated by fasttext(1) is -# loaded properly -class TestLoadModel(unittest.TestCase): +# Test to make sure that cbow interface run correctly +class TestCBOWModel(unittest.TestCase): def test_load_cbow_model(self): model = ft.load_model(cbow_file) @@ -22,7 +23,7 @@ def test_load_cbow_model(self): self.assertEqual(model.dim, 50) self.assertEqual(model.ws, 5) self.assertEqual(model.epoch, 1) - self.assertEqual(model.min_count, 3) + self.assertEqual(model.min_count, 1) self.assertEqual(model.neg, 5) self.assertEqual(model.loss_name, 'ns') self.assertEqual(model.bucket, 2000000) @@ -34,13 +35,19 @@ def test_load_cbow_model(self): # Make sure the vector have the right dimension self.assertEqual(len(model.get_vector('the')), model.dim) + # Make sure we support unicode character + words = model.words + exists = u'Καλημέρα' in words + self.assertTrue(exists) + self.assertEqual(len(model.get_vector(u'Καλημέρα')), model.dim) + def test_create_cbow_model(self): # set params lr=0.005 dim=10 ws=5 epoch=5 - min_count=5 + min_count=1 neg=5 word_ngrams=1 loss='ns' @@ -77,5 +84,11 @@ def test_create_cbow_model(self): # Make sure the vector have the right dimension self.assertEqual(len(model.get_vector('the')), dim) + # Make sure we support unicode character + words = model.words + exists = u'Καλημέρα' in words + self.assertTrue(exists) + self.assertEqual(len(model.get_vector(u'Καλημέρα')), model.dim) + if __name__ == '__main__': unittest.main() diff --git a/test/skipgram_test.py b/test/skipgram_test.py index 477ecd0..a86964b 100644 --- a/test/skipgram_test.py +++ b/test/skipgram_test.py @@ -11,9 +11,8 @@ input_file = path.join(path.dirname(__file__), 'params_test.txt') output = path.join(path.dirname(__file__), 'generated_skipgram') -# Test to make sure that binary file generated by fasttext(1) is -# loaded properly -class 
TestLoadModel(unittest.TestCase): +# Test to make sure that skipgram interface run correctly +class TestSkipgramModel(unittest.TestCase): def test_load_skipgram_model(self): model = ft.load_model(skipgram_file) @@ -38,7 +37,8 @@ def test_load_skipgram_model(self): self.assertEqual(len(model.get_vector('the')), model.dim) # Make sure we support unicode character - exists = u'Καλημέρα' in model.words + words = model.words + exists = u'Καλημέρα' in words self.assertTrue(exists) self.assertEqual(len(model.get_vector(u'Καλημέρα')), model.dim) @@ -86,7 +86,8 @@ def test_create_skipgram_model(self): self.assertEqual(len(model.get_vector('the')), dim) # Make sure we support unicode character - exists = u'Καλημέρα' in model.words + words = model.words + exists = u'Καλημέρα' in words self.assertTrue(exists) self.assertEqual(len(model.get_vector(u'Καλημέρα')), model.dim) From a96ef349157a5708ef05354014e3771be9ce2949 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Fri, 12 Aug 2016 09:42:20 +0700 Subject: [PATCH 006/109] Fix unicode literals SyntaxError in Python 3.2 --- test/cbow_test.py | 16 +++++++--------- test/skipgram_test.py | 17 +++++++---------- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/test/cbow_test.py b/test/cbow_test.py index b884697..0746d36 100644 --- a/test/cbow_test.py +++ b/test/cbow_test.py @@ -1,7 +1,7 @@ # Set encoding to support Python 2 # -*- coding: utf-8 -*- +from __future__ import unicode_literals import unittest - from os import path import fasttext as ft @@ -36,10 +36,9 @@ def test_load_cbow_model(self): self.assertEqual(len(model.get_vector('the')), model.dim) # Make sure we support unicode character - words = model.words - exists = u'Καλημέρα' in words - self.assertTrue(exists) - self.assertEqual(len(model.get_vector(u'Καλημέρα')), model.dim) + unicode_str = 'Καλημέρα' + self.assertTrue(unicode_str in model.words) + self.assertEqual(len(model.get_vector(unicode_str)), model.dim) def test_create_cbow_model(self): # set 
params @@ -85,10 +84,9 @@ def test_create_cbow_model(self): self.assertEqual(len(model.get_vector('the')), dim) # Make sure we support unicode character - words = model.words - exists = u'Καλημέρα' in words - self.assertTrue(exists) - self.assertEqual(len(model.get_vector(u'Καλημέρα')), model.dim) + unicode_str = 'Καλημέρα' + self.assertTrue(unicode_str in model.words) + self.assertEqual(len(model.get_vector(unicode_str)), model.dim) if __name__ == '__main__': unittest.main() diff --git a/test/skipgram_test.py b/test/skipgram_test.py index a86964b..8d32056 100644 --- a/test/skipgram_test.py +++ b/test/skipgram_test.py @@ -1,8 +1,7 @@ # Set encoding to support Python 2 # -*- coding: utf-8 -*- - +from __future__ import unicode_literals import unittest - from os import path import fasttext as ft @@ -37,10 +36,9 @@ def test_load_skipgram_model(self): self.assertEqual(len(model.get_vector('the')), model.dim) # Make sure we support unicode character - words = model.words - exists = u'Καλημέρα' in words - self.assertTrue(exists) - self.assertEqual(len(model.get_vector(u'Καλημέρα')), model.dim) + unicode_str = 'Καλημέρα' + self.assertTrue(unicode_str in model.words) + self.assertEqual(len(model.get_vector(unicode_str)), model.dim) def test_create_skipgram_model(self): # set params @@ -86,10 +84,9 @@ def test_create_skipgram_model(self): self.assertEqual(len(model.get_vector('the')), dim) # Make sure we support unicode character - words = model.words - exists = u'Καλημέρα' in words - self.assertTrue(exists) - self.assertEqual(len(model.get_vector(u'Καλημέρα')), model.dim) + unicode_str = 'Καλημέρα' + self.assertTrue(unicode_str in model.words) + self.assertEqual(len(model.get_vector(unicode_str)), model.dim) if __name__ == '__main__': unittest.main() From c791a00cd16cf99e7955f7d4e22ea96064e32eb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Benesty?= Date: Fri, 12 Aug 2016 11:03:51 +0200 Subject: [PATCH 007/109] resolve #15, resolve #16 Add contains and getitem 
functions (#17) * Add contains and getitem functions * change version * Update doc and tests * Update doc and tests * Reintroduce getvector function (compatibility) --- .gitignore | 6 ++++++ README.md | 12 +++++++----- fasttext/model.py | 16 +++++++++++----- setup.py | 2 +- test/cbow_test.py | 7 +++++-- test/skipgram_test.py | 7 +++++-- 6 files changed, 35 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index ddb7514..85b5201 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,9 @@ dist/ fasttext/fasttext.cpp facebookresearch-fasttext-* + +# Intellij +.idea/ + +# pip +.eggs/ \ No newline at end of file diff --git a/README.md b/README.md index 56e54bd..35e1556 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,9 @@ The previously trained model can be used to compute word vectors for out-of-vocabulary words. ```python -print model.get_vector('king') # get the vector of the word 'king' +print model.get_vector('king') +# or just use a nice syntax +print model['king'] # get the vector of the word 'king' ``` the following `fasttext(1)` command is equivalent: @@ -89,7 +91,7 @@ We can use `fasttext.load_model` to load pre-trained model: ```python model = fasttext.load_model('model.bin') print model.words # list of words in dictionary -print model.get_vector('king') # get the vector of the word 'king' +print model['king'] # get the vector of the word 'king' ``` ### Text classification @@ -106,15 +108,15 @@ import fasttext model = fasttext.skipgram(params) model.words -model.get_vector(word) +model[word] model = fasttext.cbow(params) model.words -model.get_vector(word) +model[word] model = fasttext.load_model('model.bin') model.words -model.get_vector(word) +model[word] ``` List of params and their default value: diff --git a/fasttext/model.py b/fasttext/model.py index 4c07f74..2746221 100644 --- a/fasttext/model.py +++ b/fasttext/model.py @@ -2,6 +2,7 @@ import numpy as np from numpy.linalg import norm + class WordVectorModel(object): def 
__init__(self, model, words): self._model = model @@ -23,10 +24,15 @@ def __init__(self, model, words): def get_vector(self, word): return self._model.get_vector(word) + def __getitem__(self, word): + return self._model.get_vector(word) + + def __contains__(self, word): + return word in self.words + def cosine_similarity(self, first_word, second_word): - v1 = self.get_vector(first_word) - v2 = self.get_vector(second_word) - dot_product = np.dot(v1,v2) - cosine_sim = dot_product/(norm(v1)*norm(v2)) + v1 = self.__getitem__(first_word) + v2 = self.__getitem__(second_word) + dot_product = np.dot(v1, v2) + cosine_sim = dot_product / (norm(v1) * norm(v2)) return cosine_sim - diff --git a/setup.py b/setup.py index 8336234..90a09fe 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ # Package details setup( name='fasttext', - version='0.5.18', + version='0.5.19', author='Bayu Aldi Yansyah', author_email='bayualdiyansyah@gmail.com', url='https://github.com/pyk/fastText.py', diff --git a/test/cbow_test.py b/test/cbow_test.py index 0746d36..114f003 100644 --- a/test/cbow_test.py +++ b/test/cbow_test.py @@ -33,12 +33,12 @@ def test_load_cbow_model(self): self.assertEqual(model.t, 1e-4) # Make sure the vector have the right dimension - self.assertEqual(len(model.get_vector('the')), model.dim) + self.assertEqual(len(model['the']), model.dim) # Make sure we support unicode character unicode_str = 'Καλημέρα' self.assertTrue(unicode_str in model.words) - self.assertEqual(len(model.get_vector(unicode_str)), model.dim) + self.assertEqual(len(model[unicode_str]), model.dim) def test_create_cbow_model(self): # set params @@ -81,11 +81,14 @@ def test_create_cbow_model(self): self.assertTrue(path.isfile(output + '.vec')) # Make sure the vector have the right dimension + self.assertEqual(len(model['the']), dim) self.assertEqual(len(model.get_vector('the')), dim) # Make sure we support unicode character unicode_str = 'Καλημέρα' self.assertTrue(unicode_str in model.words) + 
self.assertTrue(unicode_str in model) + self.assertEqual(len(model[unicode_str]), model.dim) self.assertEqual(len(model.get_vector(unicode_str)), model.dim) if __name__ == '__main__': diff --git a/test/skipgram_test.py b/test/skipgram_test.py index 8d32056..fe9219d 100644 --- a/test/skipgram_test.py +++ b/test/skipgram_test.py @@ -33,12 +33,12 @@ def test_load_skipgram_model(self): self.assertEqual(model.t, 1e-4) # Make sure the vector have the right dimension - self.assertEqual(len(model.get_vector('the')), model.dim) + self.assertEqual(len(model['the']), model.dim) # Make sure we support unicode character unicode_str = 'Καλημέρα' self.assertTrue(unicode_str in model.words) - self.assertEqual(len(model.get_vector(unicode_str)), model.dim) + self.assertEqual(len(model[unicode_str]), model.dim) def test_create_skipgram_model(self): # set params @@ -81,11 +81,14 @@ def test_create_skipgram_model(self): self.assertTrue(path.isfile(output + '.vec')) # Make sure the vector have the right dimension + self.assertEqual(len(model['the']), dim) self.assertEqual(len(model.get_vector('the')), dim) # Make sure we support unicode character unicode_str = 'Καλημέρα' self.assertTrue(unicode_str in model.words) + self.assertTrue(unicode_str in model) + self.assertEqual(len(model[unicode_str]), model.dim) self.assertEqual(len(model.get_vector(unicode_str)), model.dim) if __name__ == '__main__': From 3e340f0e5ffe35b78f9918079d462a1e2ae5317f Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Sat, 13 Aug 2016 03:40:20 +0700 Subject: [PATCH 008/109] Update the intro and the API docs --- README.md | 103 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 94 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 35e1556..1731a69 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# fasttext [![Build Status](https://travis-ci.org/salestock/fastText.py.svg?branch=master)](https://travis-ci.org/salestock/fastText.py) +# fasttext [![Build 
Status](https://travis-ci.org/salestock/fastText.py.svg?branch=master)](https://travis-ci.org/salestock/fastText.py) [![PyPI version](https://badge.fury.io/py/fasttext.svg)](https://badge.fury.io/py/fasttext) fasttext is a Python interface for [Facebook fastText](https://github.com/facebookresearch/fastText). @@ -96,8 +96,43 @@ print model['king'] # get the vector of the word 'king' ### Text classification -_Works in progress_ +This package can also be used to train supervised text classifiers and load +pre-trained classifier from fastText. +In order to train a text classifier using the method described in +[2](#bag-of-tricks-for-efficient-text-classification), we can use +the following function: + +```python +classifier = fasttext.supervised('data.train.txt', 'model') +``` + +equivalent as `fasttext(1)` command: + +```shell +./fasttext supervised -input data.train.txt -output model +``` + +where `data.train.txt` is a text file containing a training sentence per line +along with the labels. By default, we assume that labels are words +that are prefixed by the string `__label__`. + +We can specify the label prefix with the `label_prefix` param: + +```python +classifier = fasttext.supervised('data.train.txt', 'model', + label_prefix='__label__') +``` + +equivalent as `fasttext(1)` command: + +```shell +./fasttext supervised -input data.train.txt -output model -label '__label__' +``` + +This will output two files: `model.bin` and `model.vec`. 
+ +TODO: add test and predict ## API documentation @@ -106,20 +141,67 @@ _Works in progress_ ```python import fasttext +# Skipgram model +# Train & load skipgram model model = fasttext.skipgram(params) -model.words -model[word] +# CBOW model +# Train & load CBOW model model = fasttext.cbow(params) -model.words -model[word] +# Load pre-trained model +# File .bin that previously trained or generated by fastText +# can be loaded using this function model = fasttext.load_model('model.bin') -model.words -model[word] + +# Skipgram and CBOW model have the following atributes & methods +model.model_name # Model name +model.words # List of words in the dictionary +model.dim # Size of word vector +model.ws # Size of context window +model.epoch # Number of epochs +model.min_count # Minimal number of word occurences +model.neg # Number of negative sampled +model.word_ngrams # Max length of word ngram +model.loss_name # Loss function name +model.bucket # Number of buckets +model.minn # Min length of char ngram +model.maxn # Max length of char ngram +model.lr_update_rate # Rate of updates for the learning rate +model.t # Value of sampling threshold +model.get_vector(word) # Get the vector of specified word +model[word] # Get the vector of specified word + + +# Supervised model +# Train & load the classifier +classifier = fasttext.supervised(params) + +# Load pre-trained classifier +# File .bin that previously trained or generated by fastText +# can be loaded using this function +# label_prefix is optional +classifier = fasttext.load_model('classifier.bin', label_prefix='some_prefix') + +# Classifier have the following atributes & methods +classifier.labels # List of labels +classifier.dim # Size of word vector +classifier.ws # Size of context window +classifier.epoch # Number of epochs +classifier.min_count # Minimal number of word occurences +classifier.neg # Number of negative sampled +classifier.word_ngrams # Max length of word ngram +classifier.loss_name # Loss function name 
+classifier.bucket # Number of buckets +classifier.minn # Min length of char ngram +classifier.maxn # Max length of char ngram +classifier.lr_update_rate # Rate of updates for the learning rate +classifier.t # Value of sampling threshold + +TODO: add classifier method here ``` -List of params and their default value: +List of available `params` and their default value: ``` input training file path @@ -139,6 +221,9 @@ maxn max length of char ngram [6] thread number of threads [12] t sampling threshold [0.0001] silent disable the log output from the C++ extension [1] + +# Supervised model only +label_prefix Prefix of the label name [__label__] ``` ## References From 5d4780aa4b7ed6aa98cdcf29da2ec256b0d2361c Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Sat, 13 Aug 2016 03:41:02 +0700 Subject: [PATCH 009/109] Add slack notification in Travis CI --- .travis.yml | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/.travis.yml b/.travis.yml index 1643605..cc3887a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,12 +1,13 @@ language: python python: - - "2.6" - - "2.7" - - "3.2" - - "3.3" - - "3.4" - - "3.5" - +- '2.6' +- '2.7' +- '3.2' +- '3.3' +- '3.4' +- '3.5' install: make install - script: make test +notifications: + slack: + secure: pStiYmzBbnb0W18r1i1Lz8FIakVHajsv3on1vWy8fNWVuPfaJ85ZqJhnmrr2HKlanZcl6bEMnllDctzt/F+u4HfeXHmhS7a9nZbRDDGyWIxsvJA/UqPt2byLEB1u+KbLb53eDu7MTIe63tzk1zq+4BTupI+btc4igiUuzAhqh4+LP9eZe2L58aC+jOzIn/9Kno7+xawhj2DKs6m3O/hcXFORcOpdtWRFpoDa66dN7xPVbN0hYD80uVApEpghnHToiJN0HhhB92YmZHa1ByWj7u9VN1Eaex1srGQOJQG3FaDBJY1r2e9c7Sj+33gkZb1AqjeOpxhRsxxVUdigDvCoxIrr6ll0/p3n6pUfRGQ7SB1A7NoRBC+g6aTJbOLr5NjQDBmZHaFXx/QFd1h0EUfgBybDI3v4cKOtV8vIFoT1xdkGs/Hjo4v9z4KO6R135uDBwaJAo9cWx360xV1UK1cb4kfzdbJFk4mNmMEbdwJHT27a7e3uWr1lu6CrMUzVk0EXj1BroKC7jcRK7qthr9DcfW2mmGG3JTIKQ6+nYSEF0KC/JjjbIsg/2hKtq7mACzrHrluN6HbqCF6Kd2n2rfItsqIaCo6LEmgZ2fo69R34i96QzyHpplBivWOgC+pwLOe0FiseuleSCZ/kQgJPf62gsqCan6+GkazoEp9Ow+lPMkA= From 
5957c4e26fb6da89e489f18fd17b7249b153287e Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Sat, 13 Aug 2016 03:44:47 +0700 Subject: [PATCH 010/109] Add interface for supervised model --- fasttext/__init__.py | 1 + fasttext/fasttext.pyx | 71 ++++++++++++++++++++++++++++-------------- fasttext/interface.pxd | 4 +++ fasttext/model.py | 18 +++++++++++ 4 files changed, 71 insertions(+), 23 deletions(-) diff --git a/fasttext/__init__.py b/fasttext/__init__.py index 78082a6..7804d21 100644 --- a/fasttext/__init__.py +++ b/fasttext/__init__.py @@ -1,3 +1,4 @@ from .fasttext import skipgram from .fasttext import cbow from .fasttext import load_model +from .fasttext import supervised diff --git a/fasttext/fasttext.pyx b/fasttext/fasttext.pyx index d40f3a6..d221746 100644 --- a/fasttext/fasttext.pyx +++ b/fasttext/fasttext.pyx @@ -12,6 +12,7 @@ from libcpp.string cimport string # Python module import os from model import WordVectorModel +from model import SupervisedModel from builtins import bytes # This class wrap C++ class FastTextModel, so it can be accessed via Python @@ -84,8 +85,10 @@ cdef class FastTextModelWrapper: def t(self): return self.fm.t -# load_model: load a word vector model -def load_model(filename): +# Load .bin file that generated by fastText +# label_prefix is an optional argument to load the supervised model +# prefix will be removed from the label name and stored in the model.labels +def load_model(filename, label_prefix=''): # Check if the filename is readable if not os.path.isfile(filename): raise ValueError('fastText: trained model cannot be opened!') @@ -93,23 +96,30 @@ def load_model(filename): model = FastTextModelWrapper() filename_bytes = bytes(filename, 'utf-8') loadModelWrapper(filename_bytes, model.fm) - dictionary = model.fm.getDictionary() - cdef string word - words = [] - for i in xrange(dictionary.nwords()): - word = dictionary.getWord(i) - words.append(word.decode('utf-8')) - # TODO: handle supervised here model_name = 
model.fm.modelName + dictionary = model.fm.getDictionary() + cdef string cpp_string if model_name == 'skipgram' or model_name == 'cbow': + words = [] + for i in xrange(dictionary.nwords()): + cpp_string = dictionary.getWord(i) + words.append(cpp_string.decode('utf-8')) return WordVectorModel(model, words) + elif model_name == 'supervised': + labels = [] + for i in xrange(dictionary.nlabels()): + cpp_string = dictionary.getLabel(i) + label = cpp_string.decode('utf-8') + # Remove the prefix + labels.append(label.replace(label_prefix, '')) + return SupervisedModel(model, labels) else: - raise ValueError('fastText: model name not exists!') + raise ValueError('fastText: model name is not valid!') -# Base function to learn word representation -def _wordvector_model(model_name, input_file, output, lr, dim, ws, epoch, - min_count, neg, word_ngrams, loss, bucket, minn, maxn, thread, +# Wrapper for train(int argc, char *argv) C++ function in cpp/src/fasttext.cc +def train_wrapper(model_name, input_file, output, label_prefix, lr, dim, ws, + epoch, min_count, neg, word_ngrams, loss, bucket, minn, maxn, thread, lr_update_rate, t, silent=1): # Check if the input_file is valid @@ -119,8 +129,8 @@ def _wordvector_model(model_name, input_file, output, lr, dim, ws, epoch, # Check if the output is writeable try: f = open(output, 'w') - os.remove(output) f.close() + os.remove(output) except IOError: raise IOError('fastText: output is not writeable!') @@ -135,6 +145,11 @@ def _wordvector_model(model_name, input_file, output, lr, dim, ws, epoch, values = [input_file, output, lr, dim, ws, epoch, min_count, neg, word_ngrams, loss, bucket, minn, maxn, thread, lr_update_rate, t] + # Add -label params for supervised model + if model_name == 'supervised': + py_args.append(b'-label') + values.append(label_prefix) + for arg, value in zip(py_args, values): py_argv.append(arg) py_argv.append(bytes(str(value), 'utf-8')) @@ -151,7 +166,7 @@ def _wordvector_model(model_name, input_file, output, lr, 
dim, ws, epoch, # Load the model output_bin = output + '.bin' - model = load_model(output_bin) + model = load_model(output_bin, label_prefix) # Free the log & sigmoid tables from the heap utils.freeTables() @@ -166,15 +181,25 @@ def _wordvector_model(model_name, input_file, output, lr, dim, ws, epoch, def skipgram(input_file, output, lr=0.05, dim=100, ws=5, epoch=5, min_count=5, neg=5, word_ngrams=1, loss='ns', bucket=2000000, minn=3, maxn=6, thread=12, lr_update_rate=100, t=1e-4, silent=1): - return _wordvector_model('skipgram', input_file, output, lr, - dim, ws, epoch, min_count, neg, word_ngrams, loss, bucket, minn, - maxn, thread, lr_update_rate, t, silent) + label_prefix = '' + return train_wrapper('skipgram', input_file, output, label_prefix, lr, + dim, ws, epoch, min_count, neg, word_ngrams, loss, bucket, minn, + maxn, thread, lr_update_rate, t, silent) -# Learn word representation using cbow model +# Learn word representation using CBOW model def cbow(input_file, output, lr=0.05, dim=100, ws=5, epoch=5, min_count=5, neg=5, word_ngrams=1, loss='ns', bucket=2000000, minn=3, maxn=6, thread=12, lr_update_rate=100, t=1e-4, silent=1): - return _wordvector_model('cbow', input_file, output, lr, - dim, ws, epoch, min_count, neg, word_ngrams, loss, bucket, minn, - maxn, thread, lr_update_rate, t, silent) - + label_prefix = '' + return train_wrapper('cbow', input_file, output, label_prefix, lr, dim, + ws, epoch, min_count, neg, word_ngrams, loss, bucket, minn, maxn, + thread, lr_update_rate, t, silent) + +# Train classifier +def supervised(input_file, output, label_prefix='__label__', lr=0.05, dim=100, + ws=5, epoch=5, min_count=5, neg=5, word_ngrams=1, loss='ns', + bucket=2000000, minn=3, maxn=6, thread=12, lr_update_rate=100, + t=1e-4, silent=1): + return train_wrapper('supervised', input_file, output, label_prefix, lr, + dim, ws, epoch, min_count, neg, word_ngrams, loss, bucket, minn, + maxn, thread, lr_update_rate, t, silent) diff --git a/fasttext/interface.pxd 
b/fasttext/interface.pxd index b2fa57c..19d5119 100644 --- a/fasttext/interface.pxd +++ b/fasttext/interface.pxd @@ -9,8 +9,12 @@ cdef extern from "cpp/src/real.h": cdef extern from "cpp/src/dictionary.h": cdef cppclass Dictionary: Dictionary() + int32_t nwords() + int32_t nlabels() + string getWord(int32_t) + string getLabel(int32_t) cdef extern from "interface.h": cdef cppclass FastTextModel: diff --git a/fasttext/model.py b/fasttext/model.py index 2746221..bac1e17 100644 --- a/fasttext/model.py +++ b/fasttext/model.py @@ -36,3 +36,21 @@ def cosine_similarity(self, first_word, second_word): dot_product = np.dot(v1, v2) cosine_sim = dot_product / (norm(v1) * norm(v2)) return cosine_sim + +class SupervisedModel(object): + def __init__(self, model, labels): + self._model = model + self.labels = labels + self.dim = model.dim; + self.ws = model.ws; + self.epoch = model.epoch; + self.min_count = model.minCount; + self.neg = model.neg; + self.word_ngrams = model.wordNgrams; + self.loss_name = model.lossName.decode('utf-8'); + self.model_name = model.modelName.decode('utf-8'); + self.bucket = model.bucket; + self.minn = model.minn; + self.maxn = model.maxn; + self.lr_update_rate = model.lrUpdateRate; + self.t = model.t; From e3436fff72916a1871362f9ab13fec99013438a9 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Sat, 13 Aug 2016 03:45:47 +0700 Subject: [PATCH 011/109] Add tests for the supervised model --- Makefile | 10 +++- test/cbow_test.py | 3 + test/skipgram_test.py | 3 + test/supervised_params_test.txt | 100 ++++++++++++++++++++++++++++++++ test/supervised_test.py | 89 ++++++++++++++++++++++++++++ 5 files changed, 204 insertions(+), 1 deletion(-) create mode 100644 test/supervised_params_test.txt create mode 100644 test/supervised_test.py diff --git a/Makefile b/Makefile index 5f3cdaa..8a76cbe 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ all: install test -test: test-skipgram test-cbow +test: test-skipgram test-cbow test-supervised buildext: python 
setup.py build_ext --inplace @@ -39,3 +39,11 @@ test/cbow_params_test.bin: test-cbow: fasttext/cpp/fasttext test/cbow_params_test.bin python test/cbow_test.py --verbose +# Test for classifier +test/supervised_params_test.bin: + ./fasttext/cpp/fasttext supervised -input test/supervised_params_test.txt \ + -output test/supervised_params_test -dim 10 -lr 0.1 -wordNgrams 2 \ + -minCount 1 -bucket 2000000 -epoch 5 -thread 4 + +test-supervised: fasttext/cpp/fasttext test/supervised_params_test.bin + python test/supervised_test.py --verbose diff --git a/test/cbow_test.py b/test/cbow_test.py index 114f003..711ff8e 100644 --- a/test/cbow_test.py +++ b/test/cbow_test.py @@ -1,5 +1,8 @@ # Set encoding to support Python 2 # -*- coding: utf-8 -*- + +# We use unicode_literals to generalize unicode syntax in plain string '' +# instead of u''. (to support python 3.2) from __future__ import unicode_literals import unittest from os import path diff --git a/test/skipgram_test.py b/test/skipgram_test.py index fe9219d..6f7474d 100644 --- a/test/skipgram_test.py +++ b/test/skipgram_test.py @@ -1,5 +1,8 @@ # Set encoding to support Python 2 # -*- coding: utf-8 -*- + +# We use unicode_literals to generalize unicode syntax in plain string '' +# instead of u''. (to support python 3.2) from __future__ import unicode_literals import unittest from os import path diff --git a/test/supervised_params_test.txt b/test/supervised_params_test.txt new file mode 100644 index 0000000..3276782 --- /dev/null +++ b/test/supervised_params_test.txt @@ -0,0 +1,100 @@ +__label__2 , birchas chaim , yeshiva birchas chaim is a orthodox jewish mesivta high school in lakewood township new jersey . it was founded by rabbi shmuel zalmen stein in 2001 after his father rabbi chaim stein asked him to open a branch of telshe yeshiva in lakewood . as of the 2009-10 school year the school had an enrollment of 76 students and 6 . 6 classroom teachers ( on a fte basis ) for a student–teacher ratio of 11 . 5 1 . 
+__label__6 , motor torpedo boat pt-41 , motor torpedo boat pt-41 was a pt-20-class motor torpedo boat of the united states navy built by the electric launch company of bayonne new jersey . the boat was laid down as motor boat submarine chaser ptc-21 but was reclassified as pt-41 prior to its launch on 8 july 1941 and was completed on 23 july 1941 . +__label__11 , passiflora picturata , passiflora picturata is a species of passion flower in the passifloraceae family . +__label__13 , naya din nai raat , naya din nai raat is a 1974 bollywood drama film directed by a . bhimsingh . the film is famous as sanjeev kumar reprised the nine-role epic performance by sivaji ganesan in navarathri ( 1964 ) which was also previously reprised by akkineni nageswara rao in navarathri ( telugu 1966 ) . this film had enhanced his status and reputation as an actor in hindi cinema . +__label__11 , copiapoa coquimbana , copiapoa coquimbana is a species of clump-forming cactus native to south america . the plant bears 3 cm ( 1 . 2 in ) long yellow flowers in summer and grows up to 60 cm ( 2 ft ) high and 1 m ( 3 ft ) across . the species is named after the city of coquimbo in chile . variations include c . coquimbana var . fiedleriana c . coquimbana var . wagenknechtii c . coquimbana var . vallenarensis and c . coquimbana subsp . andina . +__label__4 , lester holmes , lester holmes ( born september 27 1969 in tylertown mississippi ) is a former american football offensive lineman in the national football league . he played college football at jackson state university and was drafted in the first round of the 1993 nfl draft . +__label__13 , metro manila ( film ) , metro manila is 2013 british-filipino independently produced crime drama film directed by sean ellis . ellis also co-produced and co-wrote the film . the film was selected as the british entry for the best foreign language film at the 86th academy awards but it was not nominated . 
+__label__2 , seafield convent grammar school , the seafield convent of the sacred heart of mary ( 1908–1976 ) was a roman catholic convent school for girls run by the religious of the sacred heart of mary founded in bootle england in 1872 . the school soon moved to siefield house in seaforth which gave it the name by which it is best known . the school moved to liverpool road crosby in 1905 . it merged with st bede ' s secondary modern in 1976 to form sacred heart catholic college . +__label__2 , all hallows catholic school , all hallows catholic school is a mixed voluntary-aided comprehensive secondary school and sixth form in weybourne farnham surrey england . the school offers many courses including a-level and btec business . as of 2014 gcse examination results are in the top quintile . +__label__10 , clarkeulia simera , clarkeulia simera is a species of moth of the tortricidae family . it is found in brazil ( santa catarina ) . +__label__4 , marko anttila , marko anttila ( born may 25 1985 in lempäälä finland ) is a professional ice hockey player . anttila currently plays for metallurg novokuznetsk of the khl and was drafted by chicago blackhawks in 2004 260th overall pick . +__label__12 , suck out the poison , suck out the poison is he is legend ' s second full-length studio album and was released on october 3 2006 . it was produced and mixed by steve evetts . it received mixed reviews from long-time fans due to a major difference in vocalist schuylar croom ' s vocals . the faq section of their website addresses this +__label__13 , legend of the lost , legend of the lost is a 1957 italian-american adventure film produced and directed by henry hathaway shot in technirama by jack cardiff and starring john wayne sophia loren and rossano brazzi . the location shooting for the film took place near tripoli libya . 
+__label__14 , a flea in her ear , a flea in her ear ( in french la puce à l ' oreille ) is a play by georges feydeau written in 1907 at the height of the belle Époque . +__label__6 , hms britomart ( j22 ) , hms britomart was a halcyon-class minesweeper of the royal navy . she served during the second world war and was sunk in 1944 in a friendly fire incident . +__label__3 , jean galbert de campistron , jean galbert de campistron ( 1656 – 11 may 1723 ) was a french dramatist +__label__5 , jim rubens , jim rubens is an american politician from the state of new hampshire . a member of the republican party rubens served in the new hampshire senate for two terms . +__label__3 , chuck lukacs , chuck lukacs is an artist whose work has appeared in role-playing games . +__label__9 , tulsi chauda , tulsi chauda is a village development committee in dhanusa district in the janakpur zone of south-eastern nepal . at the time of the 1991 nepal census it had a population of 3451 . +__label__9 , rodowo pomeranian voivodeship , rodowo [rɔˈdɔvɔ] is a village in the administrative district of gmina prabuty within kwidzyn county pomeranian voivodeship in northern poland . it lies approximately 9 kilometres ( 6 mi ) north of prabuty 22 km ( 14 mi ) northeast of kwidzyn and 71 km ( 44 mi ) southeast of the regional capital gdańsk . before 1945 the area was part of germany . for the history of the region see history of pomerania . the village has a population of 200 . +__label__4 , ken hardwick , kenneth ken hardwick ( 6 january 1931 – 4 june 1977 ) was an english footballer who played as a goalkeeper for doncaster rovers scunthorpe united and barrow . +__label__6 , ss friedrich bischoff , friedrich bischoff was a 1998 grt cargo ship that was built in 1940 by lübecker maschinenbau gesellschaft lübeck germany for german owners . she was sunk in an air raid in 1943 but later salvaged and returned to service . 
she was seized by the allies in april 1945 passed to the ministry of war transport ( mowt ) and renamed empire consequence . in 1947 she was transferred to the united states and was sold into merchant service the following year . +__label__9 , kurdeh larestan , kurdeh ( persian كورده‎ also romanized as kūrdeh also known as kūrdeh-e lār and kur-deh-lâr ) is a village in dehkuyeh rural district in the central district of larestan county fars province iran . at the 2006 census its population was 2927 in 698 families . +__label__10 , white-browed bulbul , the white-browed bulbul ( pycnonotus luteolus ) is a member of the bulbul family of passerine birds . it is a resident breeder in sri lanka and peninsular india . largely olive coloured above with whitish underparts it has a pale supercilium and a yellow vent . they are found in dense scrub habitats where they skulk within vegetation and can be difficult to see although their loud and distinct burst of calls is distinctive . +__label__3 , ruby winters , ruby winters is an american female soul singer whose records made the singles charts in both the us and uk in the 1960s and 1970s . +__label__4 , sergio momesso , sergio francesco momesso ( born september 4 1965 in montreal quebec ) is a retired professional ice hockey player who spent 13 seasons in the national hockey league between 1983 and 1997 . +__label__8 , aquiles lake , laguna Áquiles is a lake in the beni department bolivia . at an elevation of 224 m its surface area is 19 km² . +__label__5 , michael p . mcauliffe , michael p . mcauliffe is a republican member of the illinois house of representatives representing the 20th district in northwestern chicago . he has served since 1997 when he was elected following the death of his father illinois state representative roger mcauliffe . he is presently the only elected republican representing a significant portion of chicago above the county level . 
in much of his district he is the only elected republican at any level . +__label__2 , pinjarra senior high school , pinjarra senior high school is a comprehensive public high school located in pinjarra a regional centre 86 kilometres ( 53 mi ) east of perth western australia . +__label__4 , ted irvine , edward amos ted irvine ( born december 8 1944 ) is a canadian retired nhl hockey player . +__label__7 , canadian national depot , canadian national depot or warroad depot is a former railway station for the canadian national railway . the building now serves as an office for the city of warroad in the u . s . state of minnesota . +__label__14 , the sandstorm , the sandstorm is a history play written by playwright and iraq war veteran sean huze . it was originally an eleven character play ( los angeles 2004–2005 ) . it later became a ten character play after revisions for the east coast debut in washington d . c . ( 2005 ) . the play premiered in los angeles ca in september 2004 for a limited engagement directed by marlon hoffman at gardner stages a small 30 seat basement theater in hollywood ca . +__label__13 , barbie in a mermaid tale 2 , barbie in a mermaid tale 2 is a sequel to the 2010 barbie film barbie in a mermaid tale . it was released in theatres february 2012 and on dvd february 27 2012 in uk and march 6 2012 in us . the film also marks the return of kelly sheridan as the voice of barbie since barbie in a mermaid tale . +__label__2 , nicola valley institute of technology , nicola valley institute of technology ( nvit ) is british columbia’s aboriginal public post-secondary institute in merritt british columbia canada . it started in 1983 . +__label__14 , un soir au club ( novel ) , un soir au club is a novel by christian gailly published on 7 january 2001 by éditions de minuit which won the prix du livre inter prize the next year . the novel was adapted for the screen and became the 2009 film un soir au club directed by jean achache . 
+__label__11 , schefflera hierniana , schefflera hierniana is a species of plant in the araliaceae family . it is found in cameroon and equatorial guinea . its natural habitats are subtropical or tropical moist lowland forests and subtropical or tropical moist montane forests . it is threatened by habitat loss . +__label__14 , the planter ' s northern bride , the planter ' s northern bride is an 1854 novel written by caroline lee hentz in response to the publication of uncle tom ' s cabin by harriet beecher stowe in 1852 . +__label__2 , davenport university , davenport university is a private non-profit multi-location university located at 11 campuses throughout michigan and online . it was founded in 1866 by conrad swensburg and currently offers associate ' s bachelor ' s and master ' s degrees diplomas and post-grad certification programs in business technology health professions and graduate studies ( mba ) . davenport ' s w . a . lettinga main campus is located outside of grand rapids michigan . +__label__1 , virata corporation , virata corporation is an inactive acquired company that was a major contributor to the cambridge phenomenon or silicon fen high-tech cluster in the united kingdom . case studies and research papers have been created to illustrate the role of social networking in the creation of virata ' s success . there is also research available on the role the company played in silicon valley venture networks . +__label__13 , puerto escondido ( film ) , puerto escondido is a 1992 italian comedy film directed by gabriele salvatores . for this film diego abatantuono and renato carpentieri were awarded with silver ribbons for best actor and best supporting actor . +__label__9 , mączno , mączno [ˈmɔnt͡ʂnɔ] is a village in the administrative district of gmina będzino within koszalin county west pomeranian voivodeship in north-western poland . 
it lies approximately 2 kilometres ( 1 mi ) south-east of będzino 12 km ( 7 mi ) west of koszalin and 129 km ( 80 mi ) north-east of the regional capital szczecin . before 1945 the area was part of germany . for the history of the region see history of pomerania . +__label__2 , regional institute of paramedical and nursing aizawl , regional institute of paramedical and nursing sciences ( ripans ) is located at zemabawk on the slope of a hill generously donated by the government of mizoram . +__label__4 , monte scheinblum , monte scheinblum ( born may 15 1967 ) is an american professional golfer and the son of former major league baseball all star outfielder richie scheinblum . while he competed on the nike tour he became known especially for his long driving where success is achieved by a golfer hitting a golf ball the farthest . in 1992 he won the national long driving championship in the united states and was the world long driving champion . +__label__7 , coral museum , the coral museum ( chinese 珊瑚法界博物館 pinyin shānhú fǎjiè bówùguǎn ) is a museum in su-ao township yilan county taiwan . +__label__14 , the big necessity , the big necessity the unmentionable world of human waste and why it matters ( published in the united kingdom as the big necessity adventures in the world of human waste ) written by rose george is a descriptive representation of the history advancement cultural variation solutions and international need of sanitation . this work written for the purpose of global awareness of sanitation highlights the current state of a global crisis . +__label__1 , hannah banana bread company , hannah banana bread company is an american baked goods company founded in 2000 to provide stuff to specialty food retailers and consumers . the company ' s baked goods are supposedly derived from family recipes and contain no additives or preservatives . 
[citation needed] all products are named after family members and distributed to a wide variety of food service and retail customers on a wholesale basis as well as direct to consumers through the company ' s website . +__label__8 , chalinga river , the chalinga river is a river of chile . +__label__13 , i am curious ( yellow ) , i am curious ( yellow ) ( swedish jag är nyfiken – en film i gult meaning i am curious a film in yellow ) is a 1967 swedish drama film written and directed by vilgot sjöman and starring sjöman and lena nyman . it is a companion film to 1968 ' s i am curious ( blue ) the two were initially intended to be one 3½ hour film . the films are named after the colours of the swedish flag . +__label__2 , salem state university , salem state university is a four-year public university located in salem massachusetts . salem state university established in 1854 as salem normal school is located approximately fifteen miles north of boston massachusetts . salem state enrolls over 10000 undergraduate and graduate students from 27 states and 57 foreign countries . from 1968 to 2010 the institution was named salem state college . as of 2010 salem state enrolled 5894 undergraduate and 343 graduate full-time students . +__label__14 , blindsight ( cook novel ) , blindsight is a novel by robin cook . like most of cook ' s other work it is a medical thriller . this story introduces new york city pathologist laurie montgomery as being new to the medical examiner ' s office . she uncovers a series of drug overdoses and gangland-style murders with a grisly twist . an abnormal increase in the number of drug overdose cases makes laurie seriously suspicious and she starts investigating these cases . +__label__3 , marcel diallo , marcel diallo is an american musician poet artist and community builder known for his founding of the black dot artists collective ( later black dot artists inc . 
) the black new world and his revitalization efforts in west oakland ' s historic predominantly african-american prescott neighborhood aka the lower bottoms . +__label__6 , oscar-class submarine , project 949 ( granit ) and project 949a ( antey ) are soviet navy/russian navy cruise missile submarines ( nato reporting names oscar-i and oscar-ii respectively ) . project 949 submarines were the largest cruise missile submarines in service until the ohio-class ssgn cruise missile submarine converted from ssbn and returned to service on october 15 2007 . they are the fourth largest class of submarines in terms of displacement and length . +__label__2 , realschule hirschaid , realschule hirschaid is a realschule in the town of hirschaid bamberg germany . it stands between the autobahn and the rhine–main–danube canal . +__label__6 , british rail class 332 , british rail class 332 electric multiple units are used by heathrow express between london paddington and heathrow airport . +__label__3 , george herbert palmer , george herbert palmer ( march 9 1842 – may 8 1933 ) was an american scholar and author born in boston . he attended phillips academy andover and in 1864 he graduated at harvard to which he returned after study at tübingen germany and at andover theological seminary to be tutor in greek . he became alford professor of natural religion moral philosophy and civil polity at harvard ( 1889–1913 ) . in 1887 he married as his second wife alice freeman palmer . +__label__12 , rythm syndicate ( album ) , rythm syndicate is the full-length debut album released by the dance-rock band rythm syndicate the group formed by songwriter-producers carl sturken and evan rogers . it was released in 1991 on impact a subsidiary of mca and produced ( and mostly written ) by sturken/rogers . three singles were released p . a . s . s . i . o . n . charted on the billboard hot 100 peaking at number 3 while the opening track hey donna peaked just outside the top 10 reaching number 13 . 
+__label__6 , uss viking ( sp-3314 ) , the second uss viking ( sp-3314 ) was a united states navy patrol vessel in service from 1918 to 1919 . viking was built in 1915 as the motorboat caesar by the u . s . navy and sold to civilian civilian owner soon afterwards without seeing any naval service . caesar later was renamed viking while under private ownership . she operated at norfolk virginia while in private use . on 5 september 1918 the u . s . +__label__4 , samo kukovica , samo kukovica ( born february 2 1989 in brežice sfr yugoslavia ) is a slovenian motorcycle speedway rider . +__label__14 , the guardian ( belize ) , the guardian is a belizean newspaper and the official print organ of the united democratic party . it is published on thursdays and sells for bz$1 . 00 . +__label__4 , harald hasselbach , harald hasselbach ( born september 22 1967 ) is a former defensive end who played 7 seasons in the nfl for the denver broncos . he played for the broncos from 1994 to 2000 and was a starter in super bowl xxxiii . previously he played four seasons for the calgary stampeders of the cfl and also played in a grey cup winning team . +__label__13 , nutty professor ii the klumps , nutty professor ii the klumps is a 2000 science fiction romantic comedy film directed by peter segal . it is a sequel to the 1996 film the nutty professor and stars eddie murphy . like in the first one murphy plays not only the inept but brilliant scientist sherman klump but also ( wearing different but equally elaborate makeup ) most of sherman ' s family as well . +__label__2 , rmit school of management , rmit ' s school of management is an australian tertiary education school within the college of business at the royal melbourne institute of technology ( rmit university ) located in melbourne victoria . it is one of the best business schools in australia in terms of students ' satisfaction staff-student ration and research and publication from the academic staff members . 
[citation needed] +__label__13 , scrooge ( 1951 film ) , scrooge is a 1951 film adaptation of charles dickens ' s a christmas carol . it starred alastair sim as ebenezer scrooge and was directed by brian desmond hurst with a screenplay by noel langley . it was released as a christmas carol in the united states . the film also features kathleen harrison in an acclaimed turn as mrs . dilber scrooge ' s charwoman . george cole stars as the younger version of scrooge hermione baddeley as mrs . +__label__14 , superman secret origin , superman secret origin was a six-issue monthly american comic book limited series written by geoff johns and illustrated by gary frank starring the dc comics superhero superman . the story featured thedefinitive origin of superman for the modern post-infinite crisis dc universe continuity starting with clark kent in his pre-teens as superboy . +__label__10 , projapyx , projapyx is a genus of diplurans in the family projapygidae . +__label__14 , something wicked this way comes ( novel ) , something wicked this way comes is a 1962 novel by ray bradbury . it is about two 14-year-old boys jim nightshade and william halloway who have a harrowing experience with a nightmarish and bewitching traveling carnival that comes to their midwestern town on one october before halloween . the carnival ' s leader is the mysterious mr . dark who bears a tattoo for each person who lured by the offer to live out his secret fantasies has become bound in service to the carnival . mr . +__label__9 , dutluk taşova , dutluk is a village in the district of taşova amasya province turkey . +__label__5 , roger bedford jr . , roger h . bedford jr . born july 7 1956 is a democratic member of the alabama senate representing the 6th district since 1994 . he previously served from 1982 through 1990 . bedford received his education at the university of alabama and his law degree from cumberland school of law samford university . 
+__label__11 , pseudoacanthocereus , pseudoacanthocereus is a genus of cactus . +__label__2 , calvin christian school ( escondido california ) , calvin christian school is a private christian school in escondido california . it consists of a preschool elementary school junior high and high school . +__label__14 , the emperor of all maladies , the emperor of all maladies a biography of cancer is a book written by siddhartha mukherjee an indian-born american physician and oncologist . published on november 16 2010 by scribner it won the 2011 pulitzer prize for general nonfiction the jury called it an elegant inquiry at once clinical and personal . the book weaves together mukherjee ' s experiences as a hematology/oncology fellow at massachusetts general hospital as well as the history of cancer treatment and research . +__label__5 , mark hutchison , mark a . hutchison ( born may 51963 in las vegas nevada ) is an american attorney in private practice and a nevada politician . he was elected to the nevada state senate on november 6 2012 to represent senate district 6 which encompasses the northwest part of the las vegas valley including portions of the communities of summerlin desert shores and sun city . he is a member of the republican party . +__label__4 , steve stone ( footballer ) , steven brian steve stone ( born 20 august 1971 in gateshead tyne and wear ) is an english former footballer and now part of the coaching staff at newcastle united . +__label__5 , rudolf diels , rudolf diels ( 16 december 1900 - 18 november 1957 ) was a german politician and head of the gestapo in 1933-34 . he is also referred to as an ss-oberführer . he was a protégé of hermann göring . +__label__14 , nephilim ( manga ) , nephilim ( ネフィリム nefirimu ) is a shōjo manga by anna hanamaki . it was serialized in nemurenu yoru no kimyō na hanashi from 2004 to 2005 with the individual chapters published in two tankōbon volumes by asahi sonorama . 
it is licensed for an english language release in the united states by aurora publishing which began releasing the series in april 2008 . +__label__2 , cerge-ei , cerge-ei /sɜrdʒ . iː . aɪ/ is an academic institution that provides an american-style phd program in economics a us-chartered master of arts program in applied economics ( the mae program ) and the upces study abroad program . cerge-ei also conducts research in theoretical and policy-related economics . the institution is recognized as one of the top economics institutes in europe . the cerge-ei acronym stands for center for economic research and graduate education – economics institute . +__label__4 , ron carter ( basketball ) , ronald ron carter jr . ( born august 31 1956 in pittsburgh pennsylvania ) is a retired american basketball player . he played collegiately for the virginia military institute . he was a guard . carter was selected by the los angeles lakers in the 2nd round ( 26th pick overall ) of the 1978 nba draft . he played for the lakers ( 1978–79 ) and indiana pacers ( 1979–80 ) in the nba for 59 games . carter has 4 children 2 sons ronald carter iii and paul m . carter and 2 daughters bria a . carter and brooke a . +__label__10 , nishada sambara , nishada sambara is a moth of the arctiidae family . it is found on sumatra borneo java the sangihe islands bali and the philippines . the habitat consists of lowland forests . +__label__2 , annenberg school for communication at the university of pennsylvania , there are multiple annenberg schools . for the communications school at usc see usc annenberg school for communication . see also annenberg ( disambiguation ) . the annenberg school for communication is the communication school at the university of pennsylvania . the school was established in 1958 by wharton school ' s alum walter annenberg as the annenberg school of communications . the name was changed to its current title in the late 1980s . +__label__6 , hanriot hd . 
8 , the hanriot hd . 8 was a short-lived french fighter prototype of the 1910s . +__label__4 , stanley elbers , stanley elbers ( born 14 may 1992 ) in the netherlands is a dutch football ( soccer ) player who plays as a striker . he currently plays for helmond sport in the eerste divisie . +__label__9 , brzyszewo , brzyszewo [bʐɨˈʂɛvɔ] ( german birken ) is a village in the administrative district of gmina chodecz within włocławek county kuyavian-pomeranian voivodeship in north-central poland . it lies approximately 4 kilometres ( 2 mi ) north of chodecz 25 km ( 16 mi ) south of włocławek and 73 km ( 45 mi ) south-east of toruń . +__label__3 , ed manion , ed manion ( born february 28 1952 ) also known as eddie kingfish manion is an american saxophonist who plays both tenor and baritone sax . he is best known as a member of bruce springsteen with the seeger sessions band tour later called bruce springsteen with the sessions band with the release of the cd/dvd bruce springsteen with the sessions band live in dublin in 2006 . +__label__3 , nancy ( musician ) , nazmun munir nancy is a bangladeshi singer . in 2006 nancy has got married with a businessman and has a three-year old baby girl named rodela . +__label__7 , turks and caicos national museum , the turks and caicos national museum is the national museum of the turks and caicos islands . it is located in guinep house on front street to the north of cockburn town on grand turk island which is also the capital of the archipelago . established in the 1980s and opened in 1991 the museum is publicly funded as a nonprofit trust . it exhibits pre-historic lucayan culture and records the history of the islands of the colonial era and the slave trade all related to the sea . +__label__7 , fauske church , fauske church ( norwegian fauske kirke ) is a parish church in the municipality of fauske in nordland county norway . it is located in the town of fauske . 
the church is part of the fauske parish in the salten deanery in the diocese of sør-hålogaland . the white wooden church was built in 1867 and it seats about 280 people . +__label__6 , lavochkin la-160 , the lavochkin la-160 known as strelka ( arrow ) was the first soviet swept winged jet fighter research prototype . it was designed and manufactured by the lavochkin design bureau from 1946 . usaf reporting name - type 6 +__label__5 , vasily vakhrushev , vasiliy vasilyevich vakhrushev ( russian Васи́лий Васи́льевич Ва́хрушев tula russian empire 15 february [o . s . 28 february] 1902 – moscow 13 january 1947 ) was a soviet-russian statesman who was from 1939 to 1940 the chairman of the council of people ' s commissars of the russian sfsr literally meaning premier or prime minister . +__label__5 , william d . burns , william d . burns is a member of the chicago city council representing chicago ' s 4th ward . a member of the democratic party burns represented the 26th district in the illinois house of representatives from 2008 through 2011 . after winning election as an alderman on february 22 2011 he stepped down from his role as a state legislator . +__label__8 , mitchel range , the mitchel range is a mountain range in san bernardino county california . +__label__2 , esena foundation high school , esena foundation high school or esena is a private fee-paying academic institution for girls only . it is located in gulberg lahore punjab pakistan . established in 1964 esena is the very first private education institute for girls in pakistan . esena ' s director begum majid was a very learned lady . she was the daughter of the late imam jafer the chief justice of india . [citation needed] +__label__1 , maykor , maykora russian nationwide provider of it and business processes outsourcing . the company is focused on comprehensive servicing of it equipment building utility systems and business applications . servicing constitutes 99 . 9% in the company’s total revenues . 
its own chain of 83 branches and 400 local business units spreads across russia enabling maykor to render one-stop-shop services . company ' s staff includes more than 4000 certified engineers . +__label__10 , conus paulae , conus paulae is a species of sea snail a marine gastropod mollusk in the family conidae the cone snails and their allies . like all species within the genus conus these snails are predatory and venomous . they are capable of stinging humans therefore live ones should be handled carefully or not at all . +__label__5 , andor lázár , andor lázár ( 8 march 1882 – 12 june 1971 ) was a hungarian politician and jurist who served as minister of justice between 1932 and 1938 . he was born into a hungarian calvinist family of noble origin in pápa . he learnt at the calvinist college of pápa and finished law studies in budapest . during his field trips he visited most of the countries of europe but he also went to canada and the united states . +__label__12 , re-foc , re-foc is the first widely available album by mexican guitar duo rodrigo y gabriela released in 2002 . some songs are re-recorded versions of those that appeared on foc while others were written for this album . +__label__11 , palicourea canarina , palicourea canarina is a species of plant in the rubiaceae family . it is endemic to ecuador . +__label__8 , atzenberger höhe , atzenberger höhe is a mountain of baden-württemberg germany . +__label__2 , kensington university , kensington university was an unaccredited distance education institution that was based at different times in hawaii and california . it was eventually shut down by state authorities in both states . +__label__3 , simone james , simone james is a british actress best known for her role as becca swanson in television soap opera eastenders . 
+__label__6 , engineering division xco-6 , the engineering division xco-6 was an american two-seat observation biplane designed by the united states army engineering division only two were built and the type did not enter production . diff --git a/test/supervised_test.py b/test/supervised_test.py new file mode 100644 index 0000000..89b1374 --- /dev/null +++ b/test/supervised_test.py @@ -0,0 +1,89 @@ +# Set encoding to support Python 2 +# -*- coding: utf-8 -*- + +# We use unicode_literals to generalize unicode syntax in plain string '' +# instead of u''. (to support python 3.2) +from __future__ import unicode_literals +import unittest +from os import path + +import fasttext as ft + +supervised_file = path.join(path.dirname(__file__), 'supervised_params_test.bin') +input_file = path.join(path.dirname(__file__), 'supervised_params_test.txt') +output = path.join(path.dirname(__file__), 'generated_supervised') + +# Test to make sure that supervised interface run correctly +class TestSupervisedModel(unittest.TestCase): + def test_load_supervised_model(self): + label_prefix='__label__' + model = ft.load_model(supervised_file, label_prefix=label_prefix) + + # Make sure the model is returned correctly + self.assertEqual(model.model_name, 'supervised') + + # Make sure all params loaded correctly + # see Makefile on target test-supervised for the params + self.assertEqual(model.dim, 10) + self.assertEqual(model.word_ngrams, 2) + self.assertEqual(model.min_count, 1) + self.assertEqual(model.epoch, 5) + self.assertEqual(model.bucket, 2000000) + + # Count how many labels are in the input_file + labels = [] + with open(input_file, 'r') as f: + for line in f: + line = line.decode('utf-8') + label = line.split(',')[0].strip() + label = label.replace(label_prefix, '') + if label in labels: + continue + else: + labels.append(label) + + # Make sure labels are loaded correctly + self.assertTrue(sorted(model.labels) == sorted(labels)) + + def test_create_supervised_model(self): + # set 
params + dim=10 + lr=0.005 + epoch=1 + min_count=1 + word_ngrams=3 + bucket=2000000 + thread=4 + silent=0 + label_prefix='__label__' + + # train supervised model + model = ft.supervised(input_file, output, dim=dim, lr=lr, epoch=epoch, + min_count=min_count, word_ngrams=word_ngrams, bucket=bucket, + thread=thread, silent=silent, label_prefix=label_prefix) + + # Make sure the model is generated correctly + self.assertEqual(model.dim, dim) + self.assertEqual(model.epoch, epoch) + self.assertEqual(model.min_count, min_count) + self.assertEqual(model.word_ngrams, word_ngrams) + self.assertEqual(model.bucket, bucket) + + # Count how many labels are in the input_file + labels = [] + with open(input_file, 'r') as f: + for line in f: + line = line.decode('utf-8') + label = line.split(',')[0].strip() + label = label.replace(label_prefix, '') + if label in labels: + continue + else: + labels.append(label) + + # Make sure .bin and .vec are generated + self.assertTrue(path.isfile(output + '.bin')) + self.assertTrue(path.isfile(output + '.vec')) + +if __name__ == '__main__': + unittest.main() From b8325725cd4bfa79fd08a210ea835d5d619bf585 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Sat, 13 Aug 2016 03:55:34 +0700 Subject: [PATCH 012/109] Update API docs --- README.md | 47 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 1731a69..802ae3a 100644 --- a/README.md +++ b/README.md @@ -136,24 +136,31 @@ TODO: add test and predict ## API documentation -### Word vector model +### Skipgram model ```python -import fasttext - -# Skipgram model # Train & load skipgram model model = fasttext.skipgram(params) +``` +### CBOW model + +```python # CBOW model # Train & load CBOW model model = fasttext.cbow(params) +``` -# Load pre-trained model +### Load pre-trained model +```python # File .bin that previously trained or generated by fastText # can be loaded using this function model = 
fasttext.load_model('model.bin') +``` + +### Attributes and methods for the model +```python # Skipgram and CBOW model have the following atributes & methods model.model_name # Model name model.words # List of words in the dictionary @@ -172,18 +179,30 @@ model.t # Value of sampling threshold model.get_vector(word) # Get the vector of specified word model[word] # Get the vector of specified word +``` + +### Supervised model -# Supervised model +```python # Train & load the classifier classifier = fasttext.supervised(params) +``` -# Load pre-trained classifier -# File .bin that previously trained or generated by fastText -# can be loaded using this function +### Load pre-trained classifier +File `.bin` that previously trained or generated by fastText can be +loaded using this function + +```python # label_prefix is optional classifier = fasttext.load_model('classifier.bin', label_prefix='some_prefix') -# Classifier have the following atributes & methods +``` + +### Attributes and methods for the classifier + +Classifier have the following atributes & methods + +```python classifier.labels # List of labels classifier.dim # Size of word vector classifier.ws # Size of context window @@ -198,12 +217,16 @@ classifier.maxn # Max length of char ngram classifier.lr_update_rate # Rate of updates for the learning rate classifier.t # Value of sampling threshold -TODO: add classifier method here ``` +TODO: add classifier method here + + +### Params List of available `params` and their default value: ``` +For Skipgram, CBOW and Supervised model input training file path output output file path lr learning rate [0.05] @@ -222,7 +245,7 @@ thread number of threads [12] t sampling threshold [0.0001] silent disable the log output from the C++ extension [1] -# Supervised model only +For Supervised model only label_prefix Prefix of the label name [__label__] ``` From 67671125db40d89533b0852c90d98209cbf5f299 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Sat, 13 Aug 2016 04:15:48 
+0700 Subject: [PATCH 013/109] Update API docs --- README.md | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 802ae3a..cba79a4 100644 --- a/README.md +++ b/README.md @@ -138,25 +138,25 @@ TODO: add test and predict ### Skipgram model +Train & load skipgram model ```python -# Train & load skipgram model model = fasttext.skipgram(params) ``` ### CBOW model +Train & load CBOW model + ```python -# CBOW model -# Train & load CBOW model model = fasttext.cbow(params) ``` ### Load pre-trained model +File `.bin` that previously trained or generated by fastText can be +loaded using this function + ```python -# File .bin that previously trained or generated by fastText -# can be loaded using this function model = fasttext.load_model('model.bin') - ``` ### Attributes and methods for the model @@ -178,13 +178,12 @@ model.lr_update_rate # Rate of updates for the learning rate model.t # Value of sampling threshold model.get_vector(word) # Get the vector of specified word model[word] # Get the vector of specified word - ``` ### Supervised model +Train & load the classifier ```python -# Train & load the classifier classifier = fasttext.supervised(params) ``` From ad91f10d28f795c99af3a2e49311de0a8bd5dd00 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Sat, 13 Aug 2016 04:16:08 +0700 Subject: [PATCH 014/109] Fix Supervised model test to support Python 3 --- test/supervised_test.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/test/supervised_test.py b/test/supervised_test.py index 89b1374..495a14b 100644 --- a/test/supervised_test.py +++ b/test/supervised_test.py @@ -34,7 +34,11 @@ def test_load_supervised_model(self): labels = [] with open(input_file, 'r') as f: for line in f: - line = line.decode('utf-8') + # str in python 3 doesn't have decode method + try: + line = line.decode('utf-8') + except: + line = line label = line.split(',')[0].strip() label = label.replace(label_prefix, 
'') if label in labels: @@ -73,7 +77,11 @@ def test_create_supervised_model(self): labels = [] with open(input_file, 'r') as f: for line in f: - line = line.decode('utf-8') + # str in python 3 doesn't have decode method + try: + line = line.decode('utf-8') + except: + line = line label = line.split(',')[0].strip() label = label.replace(label_prefix, '') if label in labels: From 600fe0ad795e2ab0e0bc74fbf0733c6737e14262 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Sat, 13 Aug 2016 04:18:30 +0700 Subject: [PATCH 015/109] Update API docs --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index cba79a4..b2b44ba 100644 --- a/README.md +++ b/README.md @@ -160,8 +160,9 @@ model = fasttext.load_model('model.bin') ``` ### Attributes and methods for the model +Skipgram and CBOW model have the following atributes & methods + ```python -# Skipgram and CBOW model have the following atributes & methods model.model_name # Model name model.words # List of words in the dictionary model.dim # Size of word vector From 2103d4aed96f270e11c8c608202b991d23912fd9 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Sat, 13 Aug 2016 20:15:44 +0700 Subject: [PATCH 016/109] Refactoring the test --- test/cbow_test.py | 2 +- test/skipgram_test.py | 2 +- test/supervised_test.py | 62 ++++++++++++++++++++--------------------- 3 files changed, 32 insertions(+), 34 deletions(-) diff --git a/test/cbow_test.py b/test/cbow_test.py index 711ff8e..566d348 100644 --- a/test/cbow_test.py +++ b/test/cbow_test.py @@ -43,7 +43,7 @@ def test_load_cbow_model(self): self.assertTrue(unicode_str in model.words) self.assertEqual(len(model[unicode_str]), model.dim) - def test_create_cbow_model(self): + def test_train_cbow_model(self): # set params lr=0.005 dim=10 diff --git a/test/skipgram_test.py b/test/skipgram_test.py index 6f7474d..9c9b9e1 100644 --- a/test/skipgram_test.py +++ b/test/skipgram_test.py @@ -43,7 +43,7 @@ def 
test_load_skipgram_model(self): self.assertTrue(unicode_str in model.words) self.assertEqual(len(model[unicode_str]), model.dim) - def test_create_skipgram_model(self): + def test_train_skipgram_model(self): # set params lr=0.005 dim=10 diff --git a/test/supervised_test.py b/test/supervised_test.py index 495a14b..10afe56 100644 --- a/test/supervised_test.py +++ b/test/supervised_test.py @@ -13,6 +13,30 @@ input_file = path.join(path.dirname(__file__), 'supervised_params_test.txt') output = path.join(path.dirname(__file__), 'generated_supervised') +def read_labels(filename, label_prefix): + labels = [] + with open(filename, 'r') as f: + for line in f: + # Python 2 read file in ASCII encoding by default + # so we need to decode the str to UTF-8 first. + # But, in Python 3, str doesn't have decode method + # so this decoding step make the test fails. + # Python 3 read file in UTF-8 encoding by default so + # we wrap this in the try-except to support both Python 2 + # and Python 3 + try: + line = line.decode('utf-8') + except: + line = line + + label = line.split(',')[0].strip() + label = label.replace(label_prefix, '') + if label in labels: + continue + else: + labels.append(label) + return labels + # Test to make sure that supervised interface run correctly class TestSupervisedModel(unittest.TestCase): def test_load_supervised_model(self): @@ -30,26 +54,13 @@ def test_load_supervised_model(self): self.assertEqual(model.epoch, 5) self.assertEqual(model.bucket, 2000000) - # Count how many labels are in the input_file - labels = [] - with open(input_file, 'r') as f: - for line in f: - # str in python 3 doesn't have decode method - try: - line = line.decode('utf-8') - except: - line = line - label = line.split(',')[0].strip() - label = label.replace(label_prefix, '') - if label in labels: - continue - else: - labels.append(label) + # Read labels from the the input_file + labels = read_labels(input_file, label_prefix) # Make sure labels are loaded correctly 
self.assertTrue(sorted(model.labels) == sorted(labels)) - def test_create_supervised_model(self): + def test_train_classifier(self): # set params dim=10 lr=0.005 @@ -61,7 +72,7 @@ def test_create_supervised_model(self): silent=0 label_prefix='__label__' - # train supervised model + # Train the classifier model = ft.supervised(input_file, output, dim=dim, lr=lr, epoch=epoch, min_count=min_count, word_ngrams=word_ngrams, bucket=bucket, thread=thread, silent=silent, label_prefix=label_prefix) @@ -73,21 +84,8 @@ def test_create_supervised_model(self): self.assertEqual(model.word_ngrams, word_ngrams) self.assertEqual(model.bucket, bucket) - # Count how many labels are in the input_file - labels = [] - with open(input_file, 'r') as f: - for line in f: - # str in python 3 doesn't have decode method - try: - line = line.decode('utf-8') - except: - line = line - label = line.split(',')[0].strip() - label = label.replace(label_prefix, '') - if label in labels: - continue - else: - labels.append(label) + # Read labels from the the input_file + labels = read_labels(input_file, label_prefix) # Make sure .bin and .vec are generated self.assertTrue(path.isfile(output + '.bin')) From 1ac0ad4d6f60a2bfca9f0332828c5d545867ebf7 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Sun, 14 Aug 2016 00:12:52 +0700 Subject: [PATCH 017/109] Add classifier.test interface --- fasttext/fasttext.pyx | 7 +++++++ fasttext/interface.cc | 45 ++++++++++++++++++++++++++++++++++++------ fasttext/interface.h | 7 +++++-- fasttext/interface.pxd | 1 + fasttext/model.py | 4 ++++ 5 files changed, 56 insertions(+), 8 deletions(-) diff --git a/fasttext/fasttext.pyx b/fasttext/fasttext.pyx index d221746..1bc24cc 100644 --- a/fasttext/fasttext.pyx +++ b/fasttext/fasttext.pyx @@ -29,6 +29,13 @@ cdef class FastTextModelWrapper: word_bytes = bytes(word, 'utf-8') return self.fm.getVectorWrapper(word_bytes) + def classifier_test(self, test_file): + test_file = bytes(test_file, 'utf-8') + result = 
self.fm.classifierTest(test_file) + precision_at_one = float(result[0]) + num_examples = int(result[1]) + return precision_at_one, num_examples + @property def dim(self): return self.fm.dim diff --git a/fasttext/interface.cc b/fasttext/interface.cc index 80cdffc..3656bb4 100644 --- a/fasttext/interface.cc +++ b/fasttext/interface.cc @@ -34,9 +34,10 @@ void FastTextModel::setDict(Dictionary dict) _dict = dict; } -void FastTextModel::setMatrix(Matrix matrix) +void FastTextModel::setMatrix(Matrix input, Matrix output) { - _matrix = matrix; + _input_matrix = input; + _output_matrix = output; } void FastTextModel::setArg(Args arg) @@ -70,16 +71,48 @@ void FastTextModel::setArg(Args arg) maxn = arg.maxn; lrUpdateRate = arg.lrUpdateRate; t = arg.t; + lr = arg.lr; } std::vector FastTextModel::getVectorWrapper(std::string word) { Vector vec(dim); - getVector(_dict, _matrix, vec, word); + getVector(_dict, _input_matrix, vec, word); std::vector vector(vec.data_, vec.data_ + vec.m_); return vector; } +std::vector FastTextModel::classifierTest(std::string filename) +{ + /* Initialize the model */ + Model model(_input_matrix, _output_matrix, dim, lr, 1); + int32_t nexamples = 0; + double precision = 0.0; + std::vector line, labels; + std::ifstream ifs(filename); + if(!ifs.is_open()) { + std::cerr << "interface.cc: Test file cannot be opened!" 
<< std::endl; + exit(EXIT_FAILURE); + } + + while (ifs.peek() != EOF) { + _dict.getLine(ifs, line, labels, model.rng); + _dict.addNgrams(line, wordNgrams); + if(labels.size() > 0 && line.size() > 0) { + int32_t i = model.predict(line); + if(std::find(labels.begin(), labels.end(), i) != labels.end()) { + precision += 1.0; + } + nexamples++; + } + } + + ifs.close(); + std::setprecision(3); + std::vector result = {precision/nexamples, nexamples}; + return result; +} + void trainWrapper(int argc, char **argv, int silent) { /* output file stream to redirect output from fastText library */ @@ -109,10 +142,10 @@ void loadModelWrapper(std::string filename, FastTextModel& model) * We parse it to the model, so we not depend on it anymore */ model.setArg(args); model.setDict(dict); - model.setMatrix(input); + model.setMatrix(input, output); - /* Do the indexing on Cython instead to support unicode - * instead of plain bytes */ + /* Do the indexing on Cython to support unicode instead of plain + * bytes */ /* for(int32_t i = 0; i < dict.nwords(); i++) { std::string word = dict.getWord(i); diff --git a/fasttext/interface.h b/fasttext/interface.h index 05cdf48..73b14b7 100644 --- a/fasttext/interface.h +++ b/fasttext/interface.h @@ -13,7 +13,8 @@ class FastTextModel { private: std::vector _words; Dictionary _dict; - Matrix _matrix; + Matrix _input_matrix; + Matrix _output_matrix; public: FastTextModel(); @@ -28,15 +29,17 @@ class FastTextModel { int bucket; int minn; int maxn; + double lr; int lrUpdateRate; double t; std::vector getWords(); std::vector getVectorWrapper(std::string word); + std::vector classifierTest(std::string filename); void addWord(std::string word); void setDict(Dictionary dict); - void setMatrix(Matrix matrix); + void setMatrix(Matrix input, Matrix output); void setArg(Args arg); Dictionary getDictionary(); diff --git a/fasttext/interface.pxd b/fasttext/interface.pxd index 19d5119..8c2e209 100644 --- a/fasttext/interface.pxd +++ b/fasttext/interface.pxd @@ 
-35,6 +35,7 @@ cdef extern from "interface.h": vector[string] getWords() vector[real] getVectorWrapper(string word) + vector[double] classifierTest(string filename) Dictionary getDictionary() diff --git a/fasttext/model.py b/fasttext/model.py index bac1e17..03040b6 100644 --- a/fasttext/model.py +++ b/fasttext/model.py @@ -54,3 +54,7 @@ def __init__(self, model, labels): self.maxn = model.maxn; self.lr_update_rate = model.lrUpdateRate; self.t = model.t; + + def test(self, test_file): + return self._model.classifier_test(test_file) + From f5c242b3a3ce09a6add73da771a9576215fe4d75 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Sun, 14 Aug 2016 00:13:26 +0700 Subject: [PATCH 018/109] Add test for the classifier.test interface --- Makefile | 7 ++++++- test/supervised_test.py | 20 ++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 8a76cbe..9d04fa6 100644 --- a/Makefile +++ b/Makefile @@ -45,5 +45,10 @@ test/supervised_params_test.bin: -output test/supervised_params_test -dim 10 -lr 0.1 -wordNgrams 2 \ -minCount 1 -bucket 2000000 -epoch 5 -thread 4 -test-supervised: fasttext/cpp/fasttext test/supervised_params_test.bin +test/supervised_test_result.txt: test/supervised_params_test.bin + ./fasttext/cpp/fasttext test test/supervised_params_test.bin \ + test/supervised_params_test.txt > test/supervised_test_result.txt + +test-supervised: fasttext/cpp/fasttext test/supervised_params_test.bin \ + test/supervised_test_result.txt python test/supervised_test.py --verbose diff --git a/test/supervised_test.py b/test/supervised_test.py index 10afe56..20e0068 100644 --- a/test/supervised_test.py +++ b/test/supervised_test.py @@ -12,6 +12,8 @@ supervised_file = path.join(path.dirname(__file__), 'supervised_params_test.bin') input_file = path.join(path.dirname(__file__), 'supervised_params_test.txt') output = path.join(path.dirname(__file__), 'generated_supervised') +test_result = path.join(path.dirname(__file__), 
'supervised_test_result.txt') +test_file = input_file # Only for test def read_labels(filename, label_prefix): labels = [] @@ -91,5 +93,23 @@ def test_train_classifier(self): self.assertTrue(path.isfile(output + '.bin')) self.assertTrue(path.isfile(output + '.vec')) + def test_classifier_test(self): + # Read the test result from fasttext(1) using the same classifier model + precision_at_one = 0.0 + num_examples = 0 + with open(test_result) as f: + lines = f.readlines() + precision_at_one = float(lines[0][5:].strip()) + num_examples = int(lines[1][20:].strip()) + + # Load and test using the same model and test set + classifier = ft.load_model(supervised_file, label_prefix='__label__') + p_at_1, num_ex = classifier.test(test_file) + + # Make sure that the test result is the same as the result generated + # by fasttext(1) + self.assertEqual(p_at_1, precision_at_one) + self.assertEqual(num_ex, num_examples) + if __name__ == '__main__': unittest.main() From d3edb045d3846fda46cae4eeb9bf9dd3fb661831 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Sun, 14 Aug 2016 00:14:15 +0700 Subject: [PATCH 019/109] Ignore the test result file from fasttext(1) --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 85b5201..c603341 100644 --- a/.gitignore +++ b/.gitignore @@ -19,4 +19,7 @@ facebookresearch-fasttext-* .idea/ # pip -.eggs/ \ No newline at end of file +.eggs/ + +# For test +test/supervised_test_result.txt From f9a74bbdd0c1b095bcd6c949be5554a6d3b552d8 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Sun, 14 Aug 2016 01:00:18 +0700 Subject: [PATCH 020/109] Using default value of args.lr when initialize a Model --- fasttext/interface.cc | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fasttext/interface.cc b/fasttext/interface.cc index 3656bb4..641be40 100644 --- a/fasttext/interface.cc +++ b/fasttext/interface.cc @@ -84,8 +84,13 @@ std::vector 
FastTextModel::getVectorWrapper(std::string word) std::vector FastTextModel::classifierTest(std::string filename) { - /* Initialize the model */ - Model model(_input_matrix, _output_matrix, dim, lr, 1); + /* Initialize the model + * We use default value of learning rate here, since the fasttext(1) test + * command also use the default value. + * https://github.com/facebookresearch/fastText/blob/9bfa32d/src/fasttext.cc#L307 + * (generated model.bin file doesn't contain the learning rate info, args.lr + * will have the default value when model.bin loaded) */ + Model model(_input_matrix, _output_matrix, dim, args.lr, 1); int32_t nexamples = 0; double precision = 0.0; std::vector line, labels; From 5aeb515c9b754e1b1c5674a441b9a187c5a69c83 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Sun, 14 Aug 2016 01:01:58 +0700 Subject: [PATCH 021/109] Add classifier.test to the API docs --- README.md | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b2b44ba..fc01b54 100644 --- a/README.md +++ b/README.md @@ -132,7 +132,22 @@ equivalent as `fasttext(1)` command: This will output two files: `model.bin` and `model.vec`. 
-TODO: add test and predict +Once the model was trained, we can evaluate it by computing the precision +at 1 (P@1) on a test set using `classifier.test` function: + +```python +precision_at_one, nexamples = classifier.test('test.txt') +print 'P@1:', precision_at_one +print 'Number of examples:', nexamples +``` + +This will print the same output to stdout as: + +```shell +./fasttext test model.bin test.txt +``` + +TODO: add predict method ## API documentation @@ -184,6 +199,7 @@ model[word] # Get the vector of specified word ### Supervised model Train & load the classifier + ```python classifier = fasttext.supervised(params) ``` @@ -195,7 +211,15 @@ loaded using this function ```python # label_prefix is optional classifier = fasttext.load_model('classifier.bin', label_prefix='some_prefix') +``` +### Test classifier +This is equivalent as `fasttext(1)` test command. The test using the same +model and test set will produce the same value for the precision at one +and the number of examples. + +```python +precision_at_one, nexamples = classifier.test(test_file) ``` ### Attributes and methods for the classifier @@ -216,7 +240,7 @@ classifier.minn # Min length of char ngram classifier.maxn # Max length of char ngram classifier.lr_update_rate # Rate of updates for the learning rate classifier.t # Value of sampling threshold - +clsasifier.test(filename) # Test the classifier ``` TODO: add classifier method here From 74e7b4fb92077bc894dde88f783c75b79553da00 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Sun, 14 Aug 2016 01:10:34 +0700 Subject: [PATCH 022/109] Update API docs --- README.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index fc01b54..160e62f 100644 --- a/README.md +++ b/README.md @@ -120,8 +120,7 @@ that are prefixed by the string `__label__`. 
We can specify the label prefix with the `label_prefix` param: ```python -classifier = fasttext.supervised('data.train.txt', 'model', - label_prefix='__label__') +classifier = fasttext.supervised('data.train.txt', 'model', label_prefix='__label__') ``` equivalent as `fasttext(1)` command: @@ -204,12 +203,16 @@ Train & load the classifier classifier = fasttext.supervised(params) ``` + ### Load pre-trained classifier File `.bin` that previously trained or generated by fastText can be -loaded using this function +loaded using this function. + +```shell +./fasttext supervised -input train.txt -output classifier -label 'some_prefix' +``` ```python -# label_prefix is optional classifier = fasttext.load_model('classifier.bin', label_prefix='some_prefix') ``` From 0f428a0082d2b4b4fd59a6cce4cbf23dbc6f98ae Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Sun, 14 Aug 2016 01:19:11 +0700 Subject: [PATCH 023/109] Explicitly cast the nexamples to double before returning the test result --- fasttext/interface.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fasttext/interface.cc b/fasttext/interface.cc index 641be40..cbc404d 100644 --- a/fasttext/interface.cc +++ b/fasttext/interface.cc @@ -114,7 +114,7 @@ std::vector FastTextModel::classifierTest(std::string filename) ifs.close(); std::setprecision(3); - std::vector result = {precision/nexamples, nexamples}; + std::vector result = {precision/nexamples, (double)nexamples}; return result; } From 2b2acb32ddfef607345ebf86b8bd60f269aee948 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Sun, 14 Aug 2016 21:17:57 +0700 Subject: [PATCH 024/109] Add classifier.predict(texts) interface --- fasttext/fasttext.pyx | 9 +++++++- fasttext/interface.cc | 47 ++++++++++++++++++++++++++++++++++++++++++ fasttext/interface.h | 2 ++ fasttext/interface.pxd | 1 + fasttext/model.py | 10 ++++++++- 5 files changed, 67 insertions(+), 2 deletions(-) diff --git a/fasttext/fasttext.pyx b/fasttext/fasttext.pyx index 
1bc24cc..68c4933 100644 --- a/fasttext/fasttext.pyx +++ b/fasttext/fasttext.pyx @@ -36,6 +36,13 @@ cdef class FastTextModelWrapper: num_examples = int(result[1]) return precision_at_one, num_examples + def classifier_predict(self, text): + cdef string cpp_string + text_bytes = bytes(text, 'utf-8') + cpp_string = self.fm.classifierPredict(text_bytes) + label = cpp_string.decode('utf-8') + return label + @property def dim(self): return self.fm.dim @@ -120,7 +127,7 @@ def load_model(filename, label_prefix=''): label = cpp_string.decode('utf-8') # Remove the prefix labels.append(label.replace(label_prefix, '')) - return SupervisedModel(model, labels) + return SupervisedModel(model, labels, label_prefix) else: raise ValueError('fastText: model name is not valid!') diff --git a/fasttext/interface.cc b/fasttext/interface.cc index cbc404d..5d07a22 100644 --- a/fasttext/interface.cc +++ b/fasttext/interface.cc @@ -1,5 +1,6 @@ /* An interface for fastText */ #include +#include #include #include @@ -9,6 +10,7 @@ #include "cpp/src/dictionary.h" #include "cpp/src/matrix.h" #include "cpp/src/vector.h" +#include "cpp/src/model.h" #include "cpp/src/fasttext.cc" @@ -91,6 +93,8 @@ std::vector FastTextModel::classifierTest(std::string filename) * (generated model.bin file doesn't contain the learning rate info, args.lr * will have the default value when model.bin loaded) */ Model model(_input_matrix, _output_matrix, dim, args.lr, 1); + model.setTargetCounts(_dict.getCounts(entry_type::label)); + int32_t nexamples = 0; double precision = 0.0; std::vector line, labels; @@ -118,6 +122,49 @@ std::vector FastTextModel::classifierTest(std::string filename) return result; } +std::string FastTextModel::classifierPredict(std::string text) +{ + /* Initialize the model + * We use default value of learning rate here, since the fasttext(1) test + * command also use the default value. 
+ * https://github.com/facebookresearch/fastText/blob/9bfa32d/src/fasttext.cc#L307 + * (generated model.bin file doesn't contain the learning rate info, args.lr + * will have the default value when model.bin loaded) */ + Model model(_input_matrix, _output_matrix, dim, args.lr, 1); + model.setTargetCounts(_dict.getCounts(entry_type::label)); + std::minstd_rand rng = model.rng; + std::uniform_real_distribution<> uniform(0, 1); + + /* Hardcoded here; since we need this variable but the variable + * is private in dictionary.h */ + const int32_t max_line_size = 1024; + + /* List of word ids */ + std::vector text_word_ids; + std::istringstream iss(text); + std::string token; + + /* We implement the same logic as Dictionary::getLine */ + while(iss >> token) { + int32_t word_id = _dict.getId(token); + if(word_id < 0) continue; + entry_type type = _dict.getType(word_id); + if (type == entry_type::word && !_dict.discard(word_id, uniform(rng))) { + text_word_ids.push_back(word_id); + } + if(text_word_ids.size() > max_line_size) break; + } + _dict.addNgrams(text_word_ids, wordNgrams); + + if(text_word_ids.size() > 0) { + int32_t i = model.predict(text_word_ids); + return _dict.getLabel(i); + } else { + return "n/a"; + } + +} + void trainWrapper(int argc, char **argv, int silent) { /* output file stream to redirect output from fastText library */ diff --git a/fasttext/interface.h b/fasttext/interface.h index 73b14b7..1392085 100644 --- a/fasttext/interface.h +++ b/fasttext/interface.h @@ -36,6 +36,7 @@ class FastTextModel { std::vector getWords(); std::vector getVectorWrapper(std::string word); std::vector classifierTest(std::string filename); + std::string classifierPredict(std::string text); void addWord(std::string word); void setDict(Dictionary dict); @@ -49,3 +50,4 @@ void trainWrapper(int argc, char **argv, int silent); void loadModelWrapper(std::string filename, FastTextModel& model); #endif + diff --git a/fasttext/interface.pxd b/fasttext/interface.pxd index 
8c2e209..6b2a963 100644 --- a/fasttext/interface.pxd +++ b/fasttext/interface.pxd @@ -36,6 +36,7 @@ cdef extern from "interface.h": vector[string] getWords() vector[real] getVectorWrapper(string word) vector[double] classifierTest(string filename) + string classifierPredict(string text) Dictionary getDictionary() diff --git a/fasttext/model.py b/fasttext/model.py index 03040b6..98b8ae6 100644 --- a/fasttext/model.py +++ b/fasttext/model.py @@ -38,7 +38,7 @@ def cosine_similarity(self, first_word, second_word): return cosine_sim class SupervisedModel(object): - def __init__(self, model, labels): + def __init__(self, model, labels, label_prefix): self._model = model self.labels = labels self.dim = model.dim; @@ -54,7 +54,15 @@ def __init__(self, model, labels): self.maxn = model.maxn; self.lr_update_rate = model.lrUpdateRate; self.t = model.t; + self.label_prefix = label_prefix; def test(self, test_file): return self._model.classifier_test(test_file) + def predict(self, texts): + labels = [] + for text in texts: + label = self._model.classifier_predict(text) + labels.append(label.replace(self.label_prefix, '')) + return labels + From ceff8e3ac3fe04fc3699148d94136b71b4d9247f Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Sun, 14 Aug 2016 21:19:56 +0700 Subject: [PATCH 025/109] Add test case for classifier.predict(texts) interface --- .gitignore | 1 + Makefile | 9 ++- test/supervised_pred_test.txt | 100 ++++++++++++++++++++++++++++++++++ test/supervised_test.py | 37 +++++++++++-- 4 files changed, 142 insertions(+), 5 deletions(-) create mode 100644 test/supervised_pred_test.txt diff --git a/.gitignore b/.gitignore index c603341..bebb67d 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,4 @@ facebookresearch-fasttext-* # For test test/supervised_test_result.txt +test/supervised_pred_result.txt diff --git a/Makefile b/Makefile index 9d04fa6..de31aa4 100644 --- a/Makefile +++ b/Makefile @@ -49,6 +49,13 @@ test/supervised_test_result.txt: 
test/supervised_params_test.bin ./fasttext/cpp/fasttext test test/supervised_params_test.bin \ test/supervised_params_test.txt > test/supervised_test_result.txt +test/supervised_pred_result.txt: test/supervised_params_test.bin + ./fasttext/cpp/fasttext predict test/supervised_params_test.bin \ + test/supervised_pred_test.txt > \ + test/supervised_pred_result.txt + test-supervised: fasttext/cpp/fasttext test/supervised_params_test.bin \ - test/supervised_test_result.txt + test/supervised_test_result.txt \ + test/supervised_pred_result.txt python test/supervised_test.py --verbose + diff --git a/test/supervised_pred_test.txt b/test/supervised_pred_test.txt new file mode 100644 index 0000000..10086e1 --- /dev/null +++ b/test/supervised_pred_test.txt @@ -0,0 +1,100 @@ +birchas chaim , yeshiva birchas chaim is a orthodox jewish mesivta high school in lakewood township new jersey . it was founded by rabbi shmuel zalmen stein in 2001 after his father rabbi chaim stein asked him to open a branch of telshe yeshiva in lakewood . as of the 2009-10 school year the school had an enrollment of 76 students and 6 . 6 classroom teachers ( on a fte basis ) for a student–teacher ratio of 11 . 5 1 . +motor torpedo boat pt-41 , motor torpedo boat pt-41 was a pt-20-class motor torpedo boat of the united states navy built by the electric launch company of bayonne new jersey . the boat was laid down as motor boat submarine chaser ptc-21 but was reclassified as pt-41 prior to its launch on 8 july 1941 and was completed on 23 july 1941 . +passiflora picturata , passiflora picturata is a species of passion flower in the passifloraceae family . +naya din nai raat , naya din nai raat is a 1974 bollywood drama film directed by a . bhimsingh . the film is famous as sanjeev kumar reprised the nine-role epic performance by sivaji ganesan in navarathri ( 1964 ) which was also previously reprised by akkineni nageswara rao in navarathri ( telugu 1966 ) . 
this film had enhanced his status and reputation as an actor in hindi cinema . +copiapoa coquimbana , copiapoa coquimbana is a species of clump-forming cactus native to south america . the plant bears 3 cm ( 1 . 2 in ) long yellow flowers in summer and grows up to 60 cm ( 2 ft ) high and 1 m ( 3 ft ) across . the species is named after the city of coquimbo in chile . variations include c . coquimbana var . fiedleriana c . coquimbana var . wagenknechtii c . coquimbana var . vallenarensis and c . coquimbana subsp . andina . +lester holmes , lester holmes ( born september 27 1969 in tylertown mississippi ) is a former american football offensive lineman in the national football league . he played college football at jackson state university and was drafted in the first round of the 1993 nfl draft . +metro manila ( film ) , metro manila is 2013 british-filipino independently produced crime drama film directed by sean ellis . ellis also co-produced and co-wrote the film . the film was selected as the british entry for the best foreign language film at the 86th academy awards but it was not nominated . +seafield convent grammar school , the seafield convent of the sacred heart of mary ( 1908–1976 ) was a roman catholic convent school for girls run by the religious of the sacred heart of mary founded in bootle england in 1872 . the school soon moved to siefield house in seaforth which gave it the name by which it is best known . the school moved to liverpool road crosby in 1905 . it merged with st bede ' s secondary modern in 1976 to form sacred heart catholic college . +all hallows catholic school , all hallows catholic school is a mixed voluntary-aided comprehensive secondary school and sixth form in weybourne farnham surrey england . the school offers many courses including a-level and btec business . as of 2014 gcse examination results are in the top quintile . +clarkeulia simera , clarkeulia simera is a species of moth of the tortricidae family . 
it is found in brazil ( santa catarina ) . +marko anttila , marko anttila ( born may 25 1985 in lempäälä finland ) is a professional ice hockey player . anttila currently plays for metallurg novokuznetsk of the khl and was drafted by chicago blackhawks in 2004 260th overall pick . +suck out the poison , suck out the poison is he is legend ' s second full-length studio album and was released on october 3 2006 . it was produced and mixed by steve evetts . it received mixed reviews from long-time fans due to a major difference in vocalist schuylar croom ' s vocals . the faq section of their website addresses this +legend of the lost , legend of the lost is a 1957 italian-american adventure film produced and directed by henry hathaway shot in technirama by jack cardiff and starring john wayne sophia loren and rossano brazzi . the location shooting for the film took place near tripoli libya . +a flea in her ear , a flea in her ear ( in french la puce à l ' oreille ) is a play by georges feydeau written in 1907 at the height of the belle Époque . +hms britomart ( j22 ) , hms britomart was a halcyon-class minesweeper of the royal navy . she served during the second world war and was sunk in 1944 in a friendly fire incident . +jean galbert de campistron , jean galbert de campistron ( 1656 – 11 may 1723 ) was a french dramatist +jim rubens , jim rubens is an american politician from the state of new hampshire . a member of the republican party rubens served in the new hampshire senate for two terms . +chuck lukacs , chuck lukacs is an artist whose work has appeared in role-playing games . +tulsi chauda , tulsi chauda is a village development committee in dhanusa district in the janakpur zone of south-eastern nepal . at the time of the 1991 nepal census it had a population of 3451 . +rodowo pomeranian voivodeship , rodowo [rɔˈdɔvɔ] is a village in the administrative district of gmina prabuty within kwidzyn county pomeranian voivodeship in northern poland . 
it lies approximately 9 kilometres ( 6 mi ) north of prabuty 22 km ( 14 mi ) northeast of kwidzyn and 71 km ( 44 mi ) southeast of the regional capital gdańsk . before 1945 the area was part of germany . for the history of the region see history of pomerania . the village has a population of 200 . +ken hardwick , kenneth ken hardwick ( 6 january 1931 – 4 june 1977 ) was an english footballer who played as a goalkeeper for doncaster rovers scunthorpe united and barrow . +ss friedrich bischoff , friedrich bischoff was a 1998 grt cargo ship that was built in 1940 by lübecker maschinenbau gesellschaft lübeck germany for german owners . she was sunk in an air raid in 1943 but later salvaged and returned to service . she was seized by the allies in april 1945 passed to the ministry of war transport ( mowt ) and renamed empire consequence . in 1947 she was transferred to the united states and was sold into merchant service the following year . +kurdeh larestan , kurdeh ( persian كورده‎ also romanized as kūrdeh also known as kūrdeh-e lār and kur-deh-lâr ) is a village in dehkuyeh rural district in the central district of larestan county fars province iran . at the 2006 census its population was 2927 in 698 families . +white-browed bulbul , the white-browed bulbul ( pycnonotus luteolus ) is a member of the bulbul family of passerine birds . it is a resident breeder in sri lanka and peninsular india . largely olive coloured above with whitish underparts it has a pale supercilium and a yellow vent . they are found in dense scrub habitats where they skulk within vegetation and can be difficult to see although their loud and distinct burst of calls is distinctive . +ruby winters , ruby winters is an american female soul singer whose records made the singles charts in both the us and uk in the 1960s and 1970s . 
+sergio momesso , sergio francesco momesso ( born september 4 1965 in montreal quebec ) is a retired professional ice hockey player who spent 13 seasons in the national hockey league between 1983 and 1997 . +aquiles lake , laguna Áquiles is a lake in the beni department bolivia . at an elevation of 224 m its surface area is 19 km² . +michael p . mcauliffe , michael p . mcauliffe is a republican member of the illinois house of representatives representing the 20th district in northwestern chicago . he has served since 1997 when he was elected following the death of his father illinois state representative roger mcauliffe . he is presently the only elected republican representing a significant portion of chicago above the county level . in much of his district he is the only elected republican at any level . +pinjarra senior high school , pinjarra senior high school is a comprehensive public high school located in pinjarra a regional centre 86 kilometres ( 53 mi ) east of perth western australia . +ted irvine , edward amos ted irvine ( born december 8 1944 ) is a canadian retired nhl hockey player . +canadian national depot , canadian national depot or warroad depot is a former railway station for the canadian national railway . the building now serves as an office for the city of warroad in the u . s . state of minnesota . +the sandstorm , the sandstorm is a history play written by playwright and iraq war veteran sean huze . it was originally an eleven character play ( los angeles 2004–2005 ) . it later became a ten character play after revisions for the east coast debut in washington d . c . ( 2005 ) . the play premiered in los angeles ca in september 2004 for a limited engagement directed by marlon hoffman at gardner stages a small 30 seat basement theater in hollywood ca . +barbie in a mermaid tale 2 , barbie in a mermaid tale 2 is a sequel to the 2010 barbie film barbie in a mermaid tale . 
it was released in theatres february 2012 and on dvd february 27 2012 in uk and march 6 2012 in us . the film also marks the return of kelly sheridan as the voice of barbie since barbie in a mermaid tale . +nicola valley institute of technology , nicola valley institute of technology ( nvit ) is british columbia’s aboriginal public post-secondary institute in merritt british columbia canada . it started in 1983 . +un soir au club ( novel ) , un soir au club is a novel by christian gailly published on 7 january 2001 by éditions de minuit which won the prix du livre inter prize the next year . the novel was adapted for the screen and became the 2009 film un soir au club directed by jean achache . +schefflera hierniana , schefflera hierniana is a species of plant in the araliaceae family . it is found in cameroon and equatorial guinea . its natural habitats are subtropical or tropical moist lowland forests and subtropical or tropical moist montane forests . it is threatened by habitat loss . +the planter ' s northern bride , the planter ' s northern bride is an 1854 novel written by caroline lee hentz in response to the publication of uncle tom ' s cabin by harriet beecher stowe in 1852 . +davenport university , davenport university is a private non-profit multi-location university located at 11 campuses throughout michigan and online . it was founded in 1866 by conrad swensburg and currently offers associate ' s bachelor ' s and master ' s degrees diplomas and post-grad certification programs in business technology health professions and graduate studies ( mba ) . davenport ' s w . a . lettinga main campus is located outside of grand rapids michigan . +virata corporation , virata corporation is an inactive acquired company that was a major contributor to the cambridge phenomenon or silicon fen high-tech cluster in the united kingdom . case studies and research papers have been created to illustrate the role of social networking in the creation of virata ' s success . 
there is also research available on the role the company played in silicon valley venture networks . +puerto escondido ( film ) , puerto escondido is a 1992 italian comedy film directed by gabriele salvatores . for this film diego abatantuono and renato carpentieri were awarded with silver ribbons for best actor and best supporting actor . +mączno , mączno [ˈmɔnt͡ʂnɔ] is a village in the administrative district of gmina będzino within koszalin county west pomeranian voivodeship in north-western poland . it lies approximately 2 kilometres ( 1 mi ) south-east of będzino 12 km ( 7 mi ) west of koszalin and 129 km ( 80 mi ) north-east of the regional capital szczecin . before 1945 the area was part of germany . for the history of the region see history of pomerania . +regional institute of paramedical and nursing aizawl , regional institute of paramedical and nursing sciences ( ripans ) is located at zemabawk on the slope of a hill generously donated by the government of mizoram . +monte scheinblum , monte scheinblum ( born may 15 1967 ) is an american professional golfer and the son of former major league baseball all star outfielder richie scheinblum . while he competed on the nike tour he became known especially for his long driving where success is achieved by a golfer hitting a golf ball the farthest . in 1992 he won the national long driving championship in the united states and was the world long driving champion . +coral museum , the coral museum ( chinese 珊瑚法界博物館 pinyin shānhú fǎjiè bówùguǎn ) is a museum in su-ao township yilan county taiwan . +the big necessity , the big necessity the unmentionable world of human waste and why it matters ( published in the united kingdom as the big necessity adventures in the world of human waste ) written by rose george is a descriptive representation of the history advancement cultural variation solutions and international need of sanitation . 
this work written for the purpose of global awareness of sanitation highlights the current state of a global crisis . +hannah banana bread company , hannah banana bread company is an american baked goods company founded in 2000 to provide stuff to specialty food retailers and consumers . the company ' s baked goods are supposedly derived from family recipes and contain no additives or preservatives . [citation needed] all products are named after family members and distributed to a wide variety of food service and retail customers on a wholesale basis as well as direct to consumers through the company ' s website . +chalinga river , the chalinga river is a river of chile . +i am curious ( yellow ) , i am curious ( yellow ) ( swedish jag är nyfiken – en film i gult meaning i am curious a film in yellow ) is a 1967 swedish drama film written and directed by vilgot sjöman and starring sjöman and lena nyman . it is a companion film to 1968 ' s i am curious ( blue ) the two were initially intended to be one 3½ hour film . the films are named after the colours of the swedish flag . +salem state university , salem state university is a four-year public university located in salem massachusetts . salem state university established in 1854 as salem normal school is located approximately fifteen miles north of boston massachusetts . salem state enrolls over 10000 undergraduate and graduate students from 27 states and 57 foreign countries . from 1968 to 2010 the institution was named salem state college . as of 2010 salem state enrolled 5894 undergraduate and 343 graduate full-time students . +blindsight ( cook novel ) , blindsight is a novel by robin cook . like most of cook ' s other work it is a medical thriller . this story introduces new york city pathologist laurie montgomery as being new to the medical examiner ' s office . she uncovers a series of drug overdoses and gangland-style murders with a grisly twist . 
an abnormal increase in the number of drug overdose cases makes laurie seriously suspicious and she starts investigating these cases . +marcel diallo , marcel diallo is an american musician poet artist and community builder known for his founding of the black dot artists collective ( later black dot artists inc . ) the black new world and his revitalization efforts in west oakland ' s historic predominantly african-american prescott neighborhood aka the lower bottoms . +oscar-class submarine , project 949 ( granit ) and project 949a ( antey ) are soviet navy/russian navy cruise missile submarines ( nato reporting names oscar-i and oscar-ii respectively ) . project 949 submarines were the largest cruise missile submarines in service until the ohio-class ssgn cruise missile submarine converted from ssbn and returned to service on october 15 2007 . they are the fourth largest class of submarines in terms of displacement and length . +realschule hirschaid , realschule hirschaid is a realschule in the town of hirschaid bamberg germany . it stands between the autobahn and the rhine–main–danube canal . +british rail class 332 , british rail class 332 electric multiple units are used by heathrow express between london paddington and heathrow airport . +george herbert palmer , george herbert palmer ( march 9 1842 – may 8 1933 ) was an american scholar and author born in boston . he attended phillips academy andover and in 1864 he graduated at harvard to which he returned after study at tübingen germany and at andover theological seminary to be tutor in greek . he became alford professor of natural religion moral philosophy and civil polity at harvard ( 1889–1913 ) . in 1887 he married as his second wife alice freeman palmer . +rythm syndicate ( album ) , rythm syndicate is the full-length debut album released by the dance-rock band rythm syndicate the group formed by songwriter-producers carl sturken and evan rogers . 
it was released in 1991 on impact a subsidiary of mca and produced ( and mostly written ) by sturken/rogers . three singles were released p . a . s . s . i . o . n . charted on the billboard hot 100 peaking at number 3 while the opening track hey donna peaked just outside the top 10 reaching number 13 . +uss viking ( sp-3314 ) , the second uss viking ( sp-3314 ) was a united states navy patrol vessel in service from 1918 to 1919 . viking was built in 1915 as the motorboat caesar by the u . s . navy and sold to civilian civilian owner soon afterwards without seeing any naval service . caesar later was renamed viking while under private ownership . she operated at norfolk virginia while in private use . on 5 september 1918 the u . s . +samo kukovica , samo kukovica ( born february 2 1989 in brežice sfr yugoslavia ) is a slovenian motorcycle speedway rider . +the guardian ( belize ) , the guardian is a belizean newspaper and the official print organ of the united democratic party . it is published on thursdays and sells for bz$1 . 00 . +harald hasselbach , harald hasselbach ( born september 22 1967 ) is a former defensive end who played 7 seasons in the nfl for the denver broncos . he played for the broncos from 1994 to 2000 and was a starter in super bowl xxxiii . previously he played four seasons for the calgary stampeders of the cfl and also played in a grey cup winning team . +nutty professor ii the klumps , nutty professor ii the klumps is a 2000 science fiction romantic comedy film directed by peter segal . it is a sequel to the 1996 film the nutty professor and stars eddie murphy . like in the first one murphy plays not only the inept but brilliant scientist sherman klump but also ( wearing different but equally elaborate makeup ) most of sherman ' s family as well . 
+rmit school of management , rmit ' s school of management is an australian tertiary education school within the college of business at the royal melbourne institute of technology ( rmit university ) located in melbourne victoria . it is one of the best business schools in australia in terms of students ' satisfaction staff-student ration and research and publication from the academic staff members . [citation needed] +scrooge ( 1951 film ) , scrooge is a 1951 film adaptation of charles dickens ' s a christmas carol . it starred alastair sim as ebenezer scrooge and was directed by brian desmond hurst with a screenplay by noel langley . it was released as a christmas carol in the united states . the film also features kathleen harrison in an acclaimed turn as mrs . dilber scrooge ' s charwoman . george cole stars as the younger version of scrooge hermione baddeley as mrs . +superman secret origin , superman secret origin was a six-issue monthly american comic book limited series written by geoff johns and illustrated by gary frank starring the dc comics superhero superman . the story featured thedefinitive origin of superman for the modern post-infinite crisis dc universe continuity starting with clark kent in his pre-teens as superboy . +projapyx , projapyx is a genus of diplurans in the family projapygidae . +something wicked this way comes ( novel ) , something wicked this way comes is a 1962 novel by ray bradbury . it is about two 14-year-old boys jim nightshade and william halloway who have a harrowing experience with a nightmarish and bewitching traveling carnival that comes to their midwestern town on one october before halloween . the carnival ' s leader is the mysterious mr . dark who bears a tattoo for each person who lured by the offer to live out his secret fantasies has become bound in service to the carnival . mr . +dutluk taşova , dutluk is a village in the district of taşova amasya province turkey . +roger bedford jr . , roger h . bedford jr . 
born july 7 1956 is a democratic member of the alabama senate representing the 6th district since 1994 . he previously served from 1982 through 1990 . bedford received his education at the university of alabama and his law degree from cumberland school of law samford university . +pseudoacanthocereus , pseudoacanthocereus is a genus of cactus . +calvin christian school ( escondido california ) , calvin christian school is a private christian school in escondido california . it consists of a preschool elementary school junior high and high school . +the emperor of all maladies , the emperor of all maladies a biography of cancer is a book written by siddhartha mukherjee an indian-born american physician and oncologist . published on november 16 2010 by scribner it won the 2011 pulitzer prize for general nonfiction the jury called it an elegant inquiry at once clinical and personal . the book weaves together mukherjee ' s experiences as a hematology/oncology fellow at massachusetts general hospital as well as the history of cancer treatment and research . +mark hutchison , mark a . hutchison ( born may 51963 in las vegas nevada ) is an american attorney in private practice and a nevada politician . he was elected to the nevada state senate on november 6 2012 to represent senate district 6 which encompasses the northwest part of the las vegas valley including portions of the communities of summerlin desert shores and sun city . he is a member of the republican party . +steve stone ( footballer ) , steven brian steve stone ( born 20 august 1971 in gateshead tyne and wear ) is an english former footballer and now part of the coaching staff at newcastle united . +rudolf diels , rudolf diels ( 16 december 1900 - 18 november 1957 ) was a german politician and head of the gestapo in 1933-34 . he is also referred to as an ss-oberführer . he was a protégé of hermann göring . +nephilim ( manga ) , nephilim ( ネフィリム nefirimu ) is a shōjo manga by anna hanamaki . 
it was serialized in nemurenu yoru no kimyō na hanashi from 2004 to 2005 with the individual chapters published in two tankōbon volumes by asahi sonorama . it is licensed for an english language release in the united states by aurora publishing which began releasing the series in april 2008 . +cerge-ei , cerge-ei /sɜrdʒ . iː . aɪ/ is an academic institution that provides an american-style phd program in economics a us-chartered master of arts program in applied economics ( the mae program ) and the upces study abroad program . cerge-ei also conducts research in theoretical and policy-related economics . the institution is recognized as one of the top economics institutes in europe . the cerge-ei acronym stands for center for economic research and graduate education – economics institute . +ron carter ( basketball ) , ronald ron carter jr . ( born august 31 1956 in pittsburgh pennsylvania ) is a retired american basketball player . he played collegiately for the virginia military institute . he was a guard . carter was selected by the los angeles lakers in the 2nd round ( 26th pick overall ) of the 1978 nba draft . he played for the lakers ( 1978–79 ) and indiana pacers ( 1979–80 ) in the nba for 59 games . carter has 4 children 2 sons ronald carter iii and paul m . carter and 2 daughters bria a . carter and brooke a . +nishada sambara , nishada sambara is a moth of the arctiidae family . it is found on sumatra borneo java the sangihe islands bali and the philippines . the habitat consists of lowland forests . +annenberg school for communication at the university of pennsylvania , there are multiple annenberg schools . for the communications school at usc see usc annenberg school for communication . see also annenberg ( disambiguation ) . the annenberg school for communication is the communication school at the university of pennsylvania . the school was established in 1958 by wharton school ' s alum walter annenberg as the annenberg school of communications . 
the name was changed to its current title in the late 1980s . +hanriot hd . 8 , the hanriot hd . 8 was a short-lived french fighter prototype of the 1910s . +stanley elbers , stanley elbers ( born 14 may 1992 ) in the netherlands is a dutch football ( soccer ) player who plays as a striker . he currently plays for helmond sport in the eerste divisie . +brzyszewo , brzyszewo [bʐɨˈʂɛvɔ] ( german birken ) is a village in the administrative district of gmina chodecz within włocławek county kuyavian-pomeranian voivodeship in north-central poland . it lies approximately 4 kilometres ( 2 mi ) north of chodecz 25 km ( 16 mi ) south of włocławek and 73 km ( 45 mi ) south-east of toruń . +ed manion , ed manion ( born february 28 1952 ) also known as eddie kingfish manion is an american saxophonist who plays both tenor and baritone sax . he is best known as a member of bruce springsteen with the seeger sessions band tour later called bruce springsteen with the sessions band with the release of the cd/dvd bruce springsteen with the sessions band live in dublin in 2006 . +nancy ( musician ) , nazmun munir nancy is a bangladeshi singer . in 2006 nancy has got married with a businessman and has a three-year old baby girl named rodela . +turks and caicos national museum , the turks and caicos national museum is the national museum of the turks and caicos islands . it is located in guinep house on front street to the north of cockburn town on grand turk island which is also the capital of the archipelago . established in the 1980s and opened in 1991 the museum is publicly funded as a nonprofit trust . it exhibits pre-historic lucayan culture and records the history of the islands of the colonial era and the slave trade all related to the sea . +fauske church , fauske church ( norwegian fauske kirke ) is a parish church in the municipality of fauske in nordland county norway . it is located in the town of fauske . 
the church is part of the fauske parish in the salten deanery in the diocese of sør-hålogaland . the white wooden church was built in 1867 and it seats about 280 people . +lavochkin la-160 , the lavochkin la-160 known as strelka ( arrow ) was the first soviet swept winged jet fighter research prototype . it was designed and manufactured by the lavochkin design bureau from 1946 . usaf reporting name - type 6 +vasily vakhrushev , vasiliy vasilyevich vakhrushev ( russian Васи́лий Васи́льевич Ва́хрушев tula russian empire 15 february [o . s . 28 february] 1902 – moscow 13 january 1947 ) was a soviet-russian statesman who was from 1939 to 1940 the chairman of the council of people ' s commissars of the russian sfsr literally meaning premier or prime minister . +william d . burns , william d . burns is a member of the chicago city council representing chicago ' s 4th ward . a member of the democratic party burns represented the 26th district in the illinois house of representatives from 2008 through 2011 . after winning election as an alderman on february 22 2011 he stepped down from his role as a state legislator . +mitchel range , the mitchel range is a mountain range in san bernardino county california . +esena foundation high school , esena foundation high school or esena is a private fee-paying academic institution for girls only . it is located in gulberg lahore punjab pakistan . established in 1964 esena is the very first private education institute for girls in pakistan . esena ' s director begum majid was a very learned lady . she was the daughter of the late imam jafer the chief justice of india . [citation needed] +maykor , maykora russian nationwide provider of it and business processes outsourcing . the company is focused on comprehensive servicing of it equipment building utility systems and business applications . servicing constitutes 99 . 9% in the company’s total revenues . 
its own chain of 83 branches and 400 local business units spreads across russia enabling maykor to render one-stop-shop services . company ' s staff includes more than 4000 certified engineers . +conus paulae , conus paulae is a species of sea snail a marine gastropod mollusk in the family conidae the cone snails and their allies . like all species within the genus conus these snails are predatory and venomous . they are capable of stinging humans therefore live ones should be handled carefully or not at all . +andor lázár , andor lázár ( 8 march 1882 – 12 june 1971 ) was a hungarian politician and jurist who served as minister of justice between 1932 and 1938 . he was born into a hungarian calvinist family of noble origin in pápa . he learnt at the calvinist college of pápa and finished law studies in budapest . during his field trips he visited most of the countries of europe but he also went to canada and the united states . +re-foc , re-foc is the first widely available album by mexican guitar duo rodrigo y gabriela released in 2002 . some songs are re-recorded versions of those that appeared on foc while others were written for this album . +palicourea canarina , palicourea canarina is a species of plant in the rubiaceae family . it is endemic to ecuador . +atzenberger höhe , atzenberger höhe is a mountain of baden-württemberg germany . +kensington university , kensington university was an unaccredited distance education institution that was based at different times in hawaii and california . it was eventually shut down by state authorities in both states . +simone james , simone james is a british actress best known for her role as becca swanson in television soap opera eastenders . +engineering division xco-6 , the engineering division xco-6 was an american two-seat observation biplane designed by the united states army engineering division only two were built and the type did not enter production . 
diff --git a/test/supervised_test.py b/test/supervised_test.py index 20e0068..928da34 100644 --- a/test/supervised_test.py +++ b/test/supervised_test.py @@ -11,11 +11,13 @@ supervised_file = path.join(path.dirname(__file__), 'supervised_params_test.bin') input_file = path.join(path.dirname(__file__), 'supervised_params_test.txt') +pred_file = path.join(path.dirname(__file__), 'supervised_pred_test.txt') output = path.join(path.dirname(__file__), 'generated_supervised') test_result = path.join(path.dirname(__file__), 'supervised_test_result.txt') +pred_result = path.join(path.dirname(__file__), 'supervised_pred_result.txt') test_file = input_file # Only for test -def read_labels(filename, label_prefix): +def read_labels(filename, label_prefix, unique=True): labels = [] with open(filename, 'r') as f: for line in f: @@ -31,10 +33,13 @@ def read_labels(filename, label_prefix): except: line = line - label = line.split(',')[0].strip() + label = line.split(',', 1)[0].strip() label = label.replace(label_prefix, '') - if label in labels: - continue + if unique: + if label in labels: + continue + else: + labels.append(label) else: labels.append(label) return labels @@ -111,5 +116,29 @@ def test_classifier_test(self): self.assertEqual(p_at_1, precision_at_one) self.assertEqual(num_ex, num_examples) + def test_classifier_predict(self): + label_prefix = '__label__' + # Load the pre-trained classifier + classifier = ft.load_model(supervised_file, label_prefix=label_prefix) + + # Read texts from the pred_file, prediction made by fasttext(1) + texts = [] + with open(pred_file, 'r') as f: + for line in f: + try: + line = line.decode('utf-8') + except: + line = line + texts.append(line) + + # Predict the labels + fasttext_labels = read_labels(pred_result, label_prefix=label_prefix, + unique=False) + labels = classifier.predict(texts) + + # Make sure the returned labels are the same as predicted by + # fasttext(1) + self.assertTrue(labels == fasttext_labels) + if __name__ == 
'__main__': unittest.main() From 9b3355de3bfa22e3fb209a722c8f3636d77ab843 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Sun, 14 Aug 2016 21:29:11 +0700 Subject: [PATCH 026/109] Add documentation of classifier.predict(texts) interface --- README.md | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 160e62f..d8ab67a 100644 --- a/README.md +++ b/README.md @@ -145,14 +145,24 @@ This will print the same output to stdout as: ```shell ./fasttext test model.bin test.txt ``` +In order to obtain the most likely label for a list of texts, we can +use the `classifier.predict` method: -TODO: add predict method +```python +texts = ['example very long text 1', 'example very long text 2'] +labels = classifier.predict(texts) +print labels +``` + +This interface is equivalent to the `fasttext(1)` predict command. The same model +with the same input set will have the same prediction. ## API documentation ### Skipgram model Train & load skipgram model + ```python model = fasttext.skipgram(params) ``` @@ -166,6 +176,7 @@ model = fasttext.cbow(params) ``` ### Load pre-trained model + File `.bin` that previously trained or generated by fastText can be loaded using this function ```python model = fasttext.load_model('model.bin') ``` @@ -174,6 +185,7 @@ ### Attributes and methods for the model + Skipgram and CBOW model have the following atributes & methods ```python @@ -205,6 +217,7 @@ classifier = fasttext.supervised(params) ### Load pre-trained classifier + File `.bin` that previously trained or generated by fastText can be loaded using this function. ```python classifier = fasttext.load_model('classifier.bin', label_prefix='some_prefix') ``` @@ -217,6 +230,7 @@ ### Test classifier + This is equivalent as `fasttext(1)` test command. The test using the same model and test set will produce the same value for the precision at one and the number of examples. @@ -225,12 +239,24 @@ 
precision_at_one, nexamples = classifier.test(test_file) ``` +### Predict the most-likely label of texts + +This interface is equivalent as `fasttext(1)` predict command. + +`texts` is an array of string + +```python +labels = classifier.predict(texts) + +``` + ### Attributes and methods for the classifier Classifier have the following atributes & methods ```python classifier.labels # List of labels +classifier.label_prefix # Prefix of the label classifier.dim # Size of word vector classifier.ws # Size of context window classifier.epoch # Number of epochs @@ -243,12 +269,10 @@ classifier.minn # Min length of char ngram classifier.maxn # Max length of char ngram classifier.lr_update_rate # Rate of updates for the learning rate classifier.t # Value of sampling threshold -clsasifier.test(filename) # Test the classifier +classifier.test(filename) # Test the classifier +classifier.predict(texts) # Predict the most likely label ``` -TODO: add classifier method here - - ### Params List of available `params` and their default value: From d9effd80e775f3c5eef7fa33024627ea7049f01b Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Sun, 14 Aug 2016 21:41:09 +0700 Subject: [PATCH 027/109] Update package version from v0.5.19 to v0.6.0 --- setup.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 90a09fe..95fcaaa 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ # Package details setup( name='fasttext', - version='0.5.19', + version='0.6.0', author='Bayu Aldi Yansyah', author_email='bayualdiyansyah@gmail.com', url='https://github.com/pyk/fastText.py', @@ -32,8 +32,7 @@ packages=['fasttext'], ext_modules = cythonize(extensions), install_requires=[ - 'numpy>=1', - 'future' + 'numpy>=1' ], classifiers= [ 'Development Status :: 2 - Pre-Alpha', From 1f92f0849854f2a489ec60cbaebcf49129ff9bd1 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Sun, 14 Aug 2016 21:59:47 +0700 Subject: [PATCH 028/109] Fix dependencies on build --- 
setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 95fcaaa..5283432 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,8 @@ packages=['fasttext'], ext_modules = cythonize(extensions), install_requires=[ - 'numpy>=1' + 'numpy>=1', + 'future' ], classifiers= [ 'Development Status :: 2 - Pre-Alpha', From 9ec82a840c9410b9d3620c166a44735ec1d45ab6 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Mon, 15 Aug 2016 10:24:55 +0700 Subject: [PATCH 029/109] Add CONTRIBUTING.md --- CONTRIBUTING.md | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..a610bfd --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,44 @@ +# How to contribute + +We definitely welcome patches and contribution to fastText.py! + +Here are some guidelines and information about how to do so. + +## Sending patches + +### Getting started + +1. Check out the code: + + $ https://github.com/salestock/fastText.py.git + $ pip install -r requirements.txt + +1. Create a fork of the fastText.py repository. +1. Add your fork as a remote: + + $ git remote add fork git@github.com:$YOURGITHUBUSERNAME/fastText.py.git + +1. Make changes, commit them. +1. Run the test suite: + + $ make install-dev + $ make test + +1. Push your changes to your fork: + + $ git push fork ... + +1. Open a pull request. + +## Filing Issues +When filing an issue, make sure to answer these five questions: + +1. What version of Python are you using (`python --version`)? +2. What operating system and processor architecture are you using? +3. What did you do? +4. What did you expect to see? +5. What did you see instead? + +### Contributing code +Unless otherwise noted, the fastText.py source files are distributed under +the BSD-style license found in the LICENSE file. 
From 955bf8e61e44d4ff26148aa178f14ceecdbd3481 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Mon, 15 Aug 2016 10:44:55 +0700 Subject: [PATCH 030/109] Add long_description in reStructuredText format --- Makefile | 8 ++ README.rst | 370 +++++++++++++++++++++++++++++++++++++++++++++++++++++ setup.py | 3 +- 3 files changed, 380 insertions(+), 1 deletion(-) create mode 100644 README.rst diff --git a/Makefile b/Makefile index de31aa4..98e8f3c 100644 --- a/Makefile +++ b/Makefile @@ -12,6 +12,14 @@ install: python setup.py install .PHONY: install +# Install the pandoc(1) first to run this command +# sudo apt-get install pandoc +README.rst: README.md + pandoc --from=markdown --to=rst --output=README.rst README.md + +upload: README.rst + python setup.py sdist upload + install-dev: python setup.py develop .PHONY: install-dev diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..8184104 --- /dev/null +++ b/README.rst @@ -0,0 +1,370 @@ +fasttext |Build Status| |PyPI version| +====================================== + +fasttext is a Python interface for `Facebook +fastText `__. + +Requirements +------------ + +fasttext support Python 2.6 or newer. It requires +`Cython `__ in order to build the +C++ extension. + +Installation +------------ + +.. code:: shell + + pip install fasttext + +Example usage +------------- + +This package has two main use cases: word representation learning and +text classification. + +These were described in the two papers +`1 <#enriching-word-vectors-with-subword-information>`__ and +`2 <#bag-of-tricks-for-efficient-text-classification>`__. + +Word representation learning +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In order to learn word vectors, as described in +`1 <#enriching-word-vectors-with-subword-information>`__, we can use +``fasttext.skipgram`` and ``fasttext.cbow`` function like the following: + +.. 
code:: python + + import fasttext + + # Skipgram model + model = fasttext.skipgram('data.txt', 'model') + print model.words # list of words in dictionary + + # CBOW model + model = fasttext.cbow('data.txt', 'model') + print model.words # list of words in dictionary + +where ``data.txt`` is a training file containing ``utf-8`` encoded text. +By default the word vectors will take into account character n-grams +from 3 to 6 characters. + +At the end of optimization the program will save two files: +``model.bin`` and ``model.vec``. + +``model.vec`` is a text file containing the word vectors, one per line. +``model.bin`` is a binary file containing the parameters of the model +along with the dictionary and all hyper parameters. + +The binary file can be used later to compute word vectors or to restart +the optimization. + +The following ``fasttext(1)`` command is equivalent + +.. code:: shell + + # Skipgram model + ./fasttext skipgram -input data.txt -output model + + # CBOW model + ./fasttext cbow -input data.txt -output model + +Obtaining word vectors for out-of-vocabulary words +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The previously trained model can be used to compute word vectors for +out-of-vocabulary words. + +.. code:: python + + print model.get_vector('king') + # or just use a nice syntax + print model['king'] # get the vector of the word 'king' + +the following ``fasttext(1)`` command is equivalent: + +.. code:: shell + + echo "king" | ./fasttext print-vectors model.bin + +This will output the vector of word ``king`` to the standard output. + +Load pre-trained model +~~~~~~~~~~~~~~~~~~~~~~ + +We can use ``fasttext.load_model`` to load pre-trained model: + +.. 
code:: python + + model = fasttext.load_model('model.bin') + print model.words # list of words in dictionary + print model['king'] # get the vector of the word 'king' + +Text classification +~~~~~~~~~~~~~~~~~~~ + +This package can also be used to train supervised text classifiers and +load pre-trained classifier from fastText. + +In order to train a text classifier using the method described in +`2 <#bag-of-tricks-for-efficient-text-classification>`__, we can use the +following function: + +.. code:: python + + classifier = fasttext.supervised('data.train.txt', 'model') + +equivalent as ``fasttext(1)`` command: + +.. code:: shell + + ./fasttext supervised -input data.train.txt -output model + +where ``data.train.txt`` is a text file containing a training sentence +per line along with the labels. By default, we assume that labels are +words that are prefixed by the string ``__label__``. + +We can specify the label prefix with the ``label_prefix`` param: + +.. code:: python + + classifier = fasttext.supervised('data.train.txt', 'model', label_prefix='__label__') + +equivalent as ``fasttext(1)`` command: + +.. code:: shell + + ./fasttext supervised -input data.train.txt -output model -label '__label__' + +This will output two files: ``model.bin`` and ``model.vec``. + +Once the model was trained, we can evaluate it by computing the +precision at 1 (P@1) on a test set using ``classifier.test`` function: + +.. code:: python + + precision_at_one, nexamples = classifier.test('test.txt') + print 'P@1:', precision_at_one + print 'Number of examples:', nexamples + +This will print the same output to stdout as: + +.. code:: shell + + ./fasttext test model.bin test.txt + +In order to obtain the most likely label for a list of text, we can use +``classifer.predict`` method: + +.. code:: python + + texts = ['example very long text 1', 'example very longtext 2'] + labels = classifier.predict(texts) + print labels + +This interface is equivalent as ``fasttext(1)`` predict command. 
The +same model with the same input set will have the same prediction. + +API documentation +----------------- + +Skipgram model +~~~~~~~~~~~~~~ + +Train & load skipgram model + +.. code:: python + + model = fasttext.skipgram(params) + +CBOW model +~~~~~~~~~~ + +Train & load CBOW model + +.. code:: python + + model = fasttext.cbow(params) + +Load pre-trained model +~~~~~~~~~~~~~~~~~~~~~~ + +File ``.bin`` that previously trained or generated by fastText can be +loaded using this function + +.. code:: python + + model = fasttext.load_model('model.bin') + +Attributes and methods for the model +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Skipgram and CBOW model have the following atributes & methods + +.. code:: python + + model.model_name # Model name + model.words # List of words in the dictionary + model.dim # Size of word vector + model.ws # Size of context window + model.epoch # Number of epochs + model.min_count # Minimal number of word occurences + model.neg # Number of negative sampled + model.word_ngrams # Max length of word ngram + model.loss_name # Loss function name + model.bucket # Number of buckets + model.minn # Min length of char ngram + model.maxn # Max length of char ngram + model.lr_update_rate # Rate of updates for the learning rate + model.t # Value of sampling threshold + model.get_vector(word) # Get the vector of specified word + model[word] # Get the vector of specified word + +Supervised model +~~~~~~~~~~~~~~~~ + +Train & load the classifier + +.. code:: python + + classifier = fasttext.supervised(params) + +Load pre-trained classifier +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +File ``.bin`` that previously trained or generated by fastText can be +loaded using this function. + +.. code:: shell + + ./fasttext supervised -input train.txt -output classifier -label 'some_prefix' + +.. 
code:: python + + classifier = fasttext.load_model('classifier.bin', label_prefix='some_prefix') + +Test classifier +~~~~~~~~~~~~~~~ + +This is equivalent as ``fasttext(1)`` test command. The test using the +same model and test set will produce the same value for the precision at +one and the number of examples. + +.. code:: python + + precision_at_one, nexamples = classifier.test(test_file) + +Predict the most-likely label of texts +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This interface is equivalent as ``fasttext(1)`` predict command. + +``texts`` is an array of string + +.. code:: python + + labels = classifier.predict(texts) + +Attributes and methods for the classifier +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Classifier have the following atributes & methods + +.. code:: python + + classifier.labels # List of labels + classifier.label_prefix # Prefix of the label + classifier.dim # Size of word vector + classifier.ws # Size of context window + classifier.epoch # Number of epochs + classifier.min_count # Minimal number of word occurences + classifier.neg # Number of negative sampled + classifier.word_ngrams # Max length of word ngram + classifier.loss_name # Loss function name + classifier.bucket # Number of buckets + classifier.minn # Min length of char ngram + classifier.maxn # Max length of char ngram + classifier.lr_update_rate # Rate of updates for the learning rate + classifier.t # Value of sampling threshold + classifier.test(filename) # Test the classifier + classifier.predict(texts) # Predict the most likely label + +Params +~~~~~~ + +List of available ``params`` and their default value: + +:: + + For Skipgram, CBOW and Supervised model + input training file path + output output file path + lr learning rate [0.05] + lr_update_rate change the rate of updates for the learning rate [100] + dim size of word vectors [100] + ws size of the context window [5] + epoch number of epochs [5] + min_count minimal number of word occurences [1] + neg number of 
negatives sampled [5] + word_ngrams max length of word ngram [1] + loss loss function {ns, hs, softmax} [ns] + bucket number of buckets [2000000] + minn min length of char ngram [3] + maxn max length of char ngram [6] + thread number of threads [12] + t sampling threshold [0.0001] + silent disable the log output from the C++ extension [1] + + For Supervised model only + label_prefix Prefix of the label name [__label__] + +References +---------- + +Enriching Word Vectors with Subword Information +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +[1] P. Bojanowski\*, E. Grave\*, A. Joulin, T. Mikolov, `*Enriching Word +Vectors with Subword +Information* `__ + +:: + + @article{bojanowski2016enriching, + title={Enriching Word Vectors with Subword Information}, + author={Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas}, + journal={arXiv preprint arXiv:1607.04606}, + year={2016} + } + +Bag of Tricks for Efficient Text Classification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +[2] A. Joulin, E. Grave, P. Bojanowski, T. Mikolov, `*Bag of Tricks for +Efficient Text +Classification* `__ + +:: + + @article{joulin2016bag, + title={Bag of Tricks for Efficient Text Classification}, + author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Mikolov, Tomas}, + journal={arXiv preprint arXiv:1607.01759}, + year={2016} + } + +(\* These authors contributed equally.) + +Join the fastText community +--------------------------- + +- Facebook page: https://www.facebook.com/groups/1174547215919768 +- Google group: + https://groups.google.com/forum/#!forum/fasttext-library + +.. |Build Status| image:: https://travis-ci.org/salestock/fastText.py.svg?branch=master + :target: https://travis-ci.org/salestock/fastText.py +.. 
|PyPI version| image:: https://badge.fury.io/py/fasttext.svg + :target: https://badge.fury.io/py/fasttext diff --git a/setup.py b/setup.py index 5283432..fa2e7c4 100644 --- a/setup.py +++ b/setup.py @@ -23,11 +23,12 @@ # Package details setup( name='fasttext', - version='0.6.0', + version='0.6.2', author='Bayu Aldi Yansyah', author_email='bayualdiyansyah@gmail.com', url='https://github.com/pyk/fastText.py', description='A Python interface for Facebook fastText library', + long_description=open('README.rst', 'r').read(), license='BSD 3-Clause License', packages=['fasttext'], ext_modules = cythonize(extensions), From 5470ace2cd1113dcce8e73fd5449a3d2bec17f76 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Mon, 15 Aug 2016 10:47:24 +0700 Subject: [PATCH 031/109] Update CONTRIBUTING.md --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a610bfd..6a0933c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -10,7 +10,7 @@ Here are some guidelines and information about how to do so. 1. Check out the code: - $ https://github.com/salestock/fastText.py.git + $ git clone https://github.com/salestock/fastText.py.git $ pip install -r requirements.txt 1. Create a fork of the fastText.py repository. From 2e1ff2b2c87cd4b4277f9a437a4985586474f29b Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Mon, 15 Aug 2016 10:48:14 +0700 Subject: [PATCH 032/109] Update CONTRIBUTING.md --- CONTRIBUTING.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6a0933c..fe5ed38 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -11,6 +11,7 @@ Here are some guidelines and information about how to do so. 1. Check out the code: $ git clone https://github.com/salestock/fastText.py.git + $ cd fastText.py $ pip install -r requirements.txt 1. Create a fork of the fastText.py repository. 
From 101dc3208f252c08c9d3e48bf63fb8307fb8adc3 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Wed, 17 Aug 2016 17:57:12 +0700 Subject: [PATCH 033/109] fastText: update 9bfa32d to 86e6b44 --- fasttext/cpp/LAST_COMMIT | 2 +- fasttext/cpp/src/model.cc | 2 +- update-fasttext.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fasttext/cpp/LAST_COMMIT b/fasttext/cpp/LAST_COMMIT index 5f5aaa7..3626474 100644 --- a/fasttext/cpp/LAST_COMMIT +++ b/fasttext/cpp/LAST_COMMIT @@ -1 +1 @@ -9bfa32d +86e6b44 diff --git a/fasttext/cpp/src/model.cc b/fasttext/cpp/src/model.cc index 09f9e4c..a6de579 100644 --- a/fasttext/cpp/src/model.cc +++ b/fasttext/cpp/src/model.cc @@ -77,7 +77,7 @@ real Model::hierarchicalSoftmax(int32_t target) { real Model::softmax(int32_t target) { grad_.zero(); output_.mul(wo_, hidden_); - real max = 0.0, z = 0.0; + real max = output_[0], z = 0.0; for (int32_t i = 0; i < osz_; i++) { max = std::max(output_[i], max); } diff --git a/update-fasttext.sh b/update-fasttext.sh index 74d260c..926f3e1 100644 --- a/update-fasttext.sh +++ b/update-fasttext.sh @@ -1,4 +1,4 @@ -NEW_VERSION=9bfa32d +NEW_VERSION=86e6b44 CURRENT_VERSION=$(cat fasttext/cpp/LAST_COMMIT) if [ "$NEW_VERSION" = "$CURRENT_VERSION" ]; then From 58be5230323a2d86e45ad79d6179c77964e475a5 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Thu, 18 Aug 2016 04:28:50 +0700 Subject: [PATCH 034/109] Add fasttext version info --- CONTRIBUTING.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index fe5ed38..e446636 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -35,10 +35,11 @@ Here are some guidelines and information about how to do so. When filing an issue, make sure to answer these five questions: 1. What version of Python are you using (`python --version`)? -2. What operating system and processor architecture are you using? -3. What did you do? -4. What did you expect to see? -5. What did you see instead? 
+2. What version of `fasttext` are you using (`pip list | grep fasttext`)? +3. What operating system and processor architecture are you using? +4. What did you do? +5. What did you expect to see? +6. What did you see instead? ### Contributing code Unless otherwise noted, the fastText.py source files are distributed under From 6917570581d12753d3b8dd262f0e5c66a3d8baac Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Fri, 19 Aug 2016 01:26:41 +0700 Subject: [PATCH 035/109] Ignore TODO file --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index bebb67d..27beed4 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,6 @@ facebookresearch-fasttext-* # For test test/supervised_test_result.txt test/supervised_pred_result.txt + +# Misc +TODO From 1051b7b7aff69335b4f11b12ab13b3c14453c23d Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Fri, 19 Aug 2016 01:34:47 +0700 Subject: [PATCH 036/109] Ignore README.rst --- .gitignore | 1 + README.rst | 370 ----------------------------------------------------- 2 files changed, 1 insertion(+), 370 deletions(-) delete mode 100644 README.rst diff --git a/.gitignore b/.gitignore index 27beed4..176a0a3 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ test/supervised_pred_result.txt # Misc TODO +README.rst diff --git a/README.rst b/README.rst deleted file mode 100644 index 8184104..0000000 --- a/README.rst +++ /dev/null @@ -1,370 +0,0 @@ -fasttext |Build Status| |PyPI version| -====================================== - -fasttext is a Python interface for `Facebook -fastText `__. - -Requirements ------------- - -fasttext support Python 2.6 or newer. It requires -`Cython `__ in order to build the -C++ extension. - -Installation ------------- - -.. code:: shell - - pip install fasttext - -Example usage -------------- - -This package has two main use cases: word representation learning and -text classification. 
- -These were described in the two papers -`1 <#enriching-word-vectors-with-subword-information>`__ and -`2 <#bag-of-tricks-for-efficient-text-classification>`__. - -Word representation learning -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -In order to learn word vectors, as described in -`1 <#enriching-word-vectors-with-subword-information>`__, we can use -``fasttext.skipgram`` and ``fasttext.cbow`` function like the following: - -.. code:: python - - import fasttext - - # Skipgram model - model = fasttext.skipgram('data.txt', 'model') - print model.words # list of words in dictionary - - # CBOW model - model = fasttext.cbow('data.txt', 'model') - print model.words # list of words in dictionary - -where ``data.txt`` is a training file containing ``utf-8`` encoded text. -By default the word vectors will take into account character n-grams -from 3 to 6 characters. - -At the end of optimization the program will save two files: -``model.bin`` and ``model.vec``. - -``model.vec`` is a text file containing the word vectors, one per line. -``model.bin`` is a binary file containing the parameters of the model -along with the dictionary and all hyper parameters. - -The binary file can be used later to compute word vectors or to restart -the optimization. - -The following ``fasttext(1)`` command is equivalent - -.. code:: shell - - # Skipgram model - ./fasttext skipgram -input data.txt -output model - - # CBOW model - ./fasttext cbow -input data.txt -output model - -Obtaining word vectors for out-of-vocabulary words -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The previously trained model can be used to compute word vectors for -out-of-vocabulary words. - -.. code:: python - - print model.get_vector('king') - # or just use a nice syntax - print model['king'] # get the vector of the word 'king' - -the following ``fasttext(1)`` command is equivalent: - -.. 
code:: shell - - echo "king" | ./fasttext print-vectors model.bin - -This will output the vector of word ``king`` to the standard output. - -Load pre-trained model -~~~~~~~~~~~~~~~~~~~~~~ - -We can use ``fasttext.load_model`` to load pre-trained model: - -.. code:: python - - model = fasttext.load_model('model.bin') - print model.words # list of words in dictionary - print model['king'] # get the vector of the word 'king' - -Text classification -~~~~~~~~~~~~~~~~~~~ - -This package can also be used to train supervised text classifiers and -load pre-trained classifier from fastText. - -In order to train a text classifier using the method described in -`2 <#bag-of-tricks-for-efficient-text-classification>`__, we can use the -following function: - -.. code:: python - - classifier = fasttext.supervised('data.train.txt', 'model') - -equivalent as ``fasttext(1)`` command: - -.. code:: shell - - ./fasttext supervised -input data.train.txt -output model - -where ``data.train.txt`` is a text file containing a training sentence -per line along with the labels. By default, we assume that labels are -words that are prefixed by the string ``__label__``. - -We can specify the label prefix with the ``label_prefix`` param: - -.. code:: python - - classifier = fasttext.supervised('data.train.txt', 'model', label_prefix='__label__') - -equivalent as ``fasttext(1)`` command: - -.. code:: shell - - ./fasttext supervised -input data.train.txt -output model -label '__label__' - -This will output two files: ``model.bin`` and ``model.vec``. - -Once the model was trained, we can evaluate it by computing the -precision at 1 (P@1) on a test set using ``classifier.test`` function: - -.. code:: python - - precision_at_one, nexamples = classifier.test('test.txt') - print 'P@1:', precision_at_one - print 'Number of examples:', nexamples - -This will print the same output to stdout as: - -.. 
code:: shell - - ./fasttext test model.bin test.txt - -In order to obtain the most likely label for a list of text, we can use -``classifer.predict`` method: - -.. code:: python - - texts = ['example very long text 1', 'example very longtext 2'] - labels = classifier.predict(texts) - print labels - -This interface is equivalent as ``fasttext(1)`` predict command. The -same model with the same input set will have the same prediction. - -API documentation ------------------ - -Skipgram model -~~~~~~~~~~~~~~ - -Train & load skipgram model - -.. code:: python - - model = fasttext.skipgram(params) - -CBOW model -~~~~~~~~~~ - -Train & load CBOW model - -.. code:: python - - model = fasttext.cbow(params) - -Load pre-trained model -~~~~~~~~~~~~~~~~~~~~~~ - -File ``.bin`` that previously trained or generated by fastText can be -loaded using this function - -.. code:: python - - model = fasttext.load_model('model.bin') - -Attributes and methods for the model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Skipgram and CBOW model have the following atributes & methods - -.. code:: python - - model.model_name # Model name - model.words # List of words in the dictionary - model.dim # Size of word vector - model.ws # Size of context window - model.epoch # Number of epochs - model.min_count # Minimal number of word occurences - model.neg # Number of negative sampled - model.word_ngrams # Max length of word ngram - model.loss_name # Loss function name - model.bucket # Number of buckets - model.minn # Min length of char ngram - model.maxn # Max length of char ngram - model.lr_update_rate # Rate of updates for the learning rate - model.t # Value of sampling threshold - model.get_vector(word) # Get the vector of specified word - model[word] # Get the vector of specified word - -Supervised model -~~~~~~~~~~~~~~~~ - -Train & load the classifier - -.. 
code:: python - - classifier = fasttext.supervised(params) - -Load pre-trained classifier -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -File ``.bin`` that previously trained or generated by fastText can be -loaded using this function. - -.. code:: shell - - ./fasttext supervised -input train.txt -output classifier -label 'some_prefix' - -.. code:: python - - classifier = fasttext.load_model('classifier.bin', label_prefix='some_prefix') - -Test classifier -~~~~~~~~~~~~~~~ - -This is equivalent as ``fasttext(1)`` test command. The test using the -same model and test set will produce the same value for the precision at -one and the number of examples. - -.. code:: python - - precision_at_one, nexamples = classifier.test(test_file) - -Predict the most-likely label of texts -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This interface is equivalent as ``fasttext(1)`` predict command. - -``texts`` is an array of string - -.. code:: python - - labels = classifier.predict(texts) - -Attributes and methods for the classifier -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Classifier have the following atributes & methods - -.. 
code:: python - - classifier.labels # List of labels - classifier.label_prefix # Prefix of the label - classifier.dim # Size of word vector - classifier.ws # Size of context window - classifier.epoch # Number of epochs - classifier.min_count # Minimal number of word occurences - classifier.neg # Number of negative sampled - classifier.word_ngrams # Max length of word ngram - classifier.loss_name # Loss function name - classifier.bucket # Number of buckets - classifier.minn # Min length of char ngram - classifier.maxn # Max length of char ngram - classifier.lr_update_rate # Rate of updates for the learning rate - classifier.t # Value of sampling threshold - classifier.test(filename) # Test the classifier - classifier.predict(texts) # Predict the most likely label - -Params -~~~~~~ - -List of available ``params`` and their default value: - -:: - - For Skipgram, CBOW and Supervised model - input training file path - output output file path - lr learning rate [0.05] - lr_update_rate change the rate of updates for the learning rate [100] - dim size of word vectors [100] - ws size of the context window [5] - epoch number of epochs [5] - min_count minimal number of word occurences [1] - neg number of negatives sampled [5] - word_ngrams max length of word ngram [1] - loss loss function {ns, hs, softmax} [ns] - bucket number of buckets [2000000] - minn min length of char ngram [3] - maxn max length of char ngram [6] - thread number of threads [12] - t sampling threshold [0.0001] - silent disable the log output from the C++ extension [1] - - For Supervised model only - label_prefix Prefix of the label name [__label__] - -References ----------- - -Enriching Word Vectors with Subword Information -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -[1] P. Bojanowski\*, E. Grave\*, A. Joulin, T. 
Mikolov, `*Enriching Word -Vectors with Subword -Information* `__ - -:: - - @article{bojanowski2016enriching, - title={Enriching Word Vectors with Subword Information}, - author={Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas}, - journal={arXiv preprint arXiv:1607.04606}, - year={2016} - } - -Bag of Tricks for Efficient Text Classification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -[2] A. Joulin, E. Grave, P. Bojanowski, T. Mikolov, `*Bag of Tricks for -Efficient Text -Classification* `__ - -:: - - @article{joulin2016bag, - title={Bag of Tricks for Efficient Text Classification}, - author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Mikolov, Tomas}, - journal={arXiv preprint arXiv:1607.01759}, - year={2016} - } - -(\* These authors contributed equally.) - -Join the fastText community ---------------------------- - -- Facebook page: https://www.facebook.com/groups/1174547215919768 -- Google group: - https://groups.google.com/forum/#!forum/fasttext-library - -.. |Build Status| image:: https://travis-ci.org/salestock/fastText.py.svg?branch=master - :target: https://travis-ci.org/salestock/fastText.py -.. 
|PyPI version| image:: https://badge.fury.io/py/fasttext.svg - :target: https://badge.fury.io/py/fasttext From c06d535465f924dee4c974c73aa87bf504ea8504 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Fri, 19 Aug 2016 02:22:45 +0700 Subject: [PATCH 037/109] Add fasttext.__VERSION__ --- fasttext/__init__.py | 2 ++ setup.py | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/fasttext/__init__.py b/fasttext/__init__.py index 7804d21..cc45fba 100644 --- a/fasttext/__init__.py +++ b/fasttext/__init__.py @@ -2,3 +2,5 @@ from .fasttext import cbow from .fasttext import load_model from .fasttext import supervised + +__VERSION__ = '0.6.2' diff --git a/setup.py b/setup.py index fa2e7c4..91bdf86 100644 --- a/setup.py +++ b/setup.py @@ -3,6 +3,9 @@ from Cython.Build import cythonize import unittest +# For __VERSION__ +import fasttext + # Define the C++ extension extensions = [ Extension('*', @@ -23,7 +26,7 @@ # Package details setup( name='fasttext', - version='0.6.2', + version=fasttext.__VERSION__, author='Bayu Aldi Yansyah', author_email='bayualdiyansyah@gmail.com', url='https://github.com/pyk/fastText.py', From 3da7bb37dfa55f45b2ce7b569603293d4d84b569 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Fri, 19 Aug 2016 14:17:54 +0700 Subject: [PATCH 038/109] Fix allocation error in FastTextModel::setMatrix --- fasttext/fasttext.pyx | 7 ++++++- fasttext/interface.cc | 2 +- fasttext/interface.h | 2 +- fasttext/interface.pxd | 5 ++++- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/fasttext/fasttext.pyx b/fasttext/fasttext.pyx index 68c4933..bd8b352 100644 --- a/fasttext/fasttext.pyx +++ b/fasttext/fasttext.pyx @@ -103,13 +103,18 @@ cdef class FastTextModelWrapper: # label_prefix is an optional argument to load the supervised model # prefix will be removed from the label name and stored in the model.labels def load_model(filename, label_prefix=''): + # Check if the filename is readable if not os.path.isfile(filename): raise 
ValueError('fastText: trained model cannot be opened!') model = FastTextModelWrapper() filename_bytes = bytes(filename, 'utf-8') - loadModelWrapper(filename_bytes, model.fm) + try: + loadModelWrapper(filename_bytes, model.fm) + except: + raise Exception('fastText: Cannot load ' + filename + + ' due to C++ extension failed to allocate the memory') model_name = model.fm.modelName dictionary = model.fm.getDictionary() diff --git a/fasttext/interface.cc b/fasttext/interface.cc index 5d07a22..29a98c0 100644 --- a/fasttext/interface.cc +++ b/fasttext/interface.cc @@ -36,7 +36,7 @@ void FastTextModel::setDict(Dictionary dict) _dict = dict; } -void FastTextModel::setMatrix(Matrix input, Matrix output) +void FastTextModel::setMatrix(Matrix& input, Matrix& output) { _input_matrix = input; _output_matrix = output; diff --git a/fasttext/interface.h b/fasttext/interface.h index 1392085..9a51524 100644 --- a/fasttext/interface.h +++ b/fasttext/interface.h @@ -40,7 +40,7 @@ class FastTextModel { void addWord(std::string word); void setDict(Dictionary dict); - void setMatrix(Matrix input, Matrix output); + void setMatrix(Matrix& input, Matrix& output); void setArg(Args arg); Dictionary getDictionary(); diff --git a/fasttext/interface.pxd b/fasttext/interface.pxd index 6b2a963..7a6fe34 100644 --- a/fasttext/interface.pxd +++ b/fasttext/interface.pxd @@ -41,6 +41,9 @@ cdef extern from "interface.h": Dictionary getDictionary() void trainWrapper(int argc, char **argvm, int silent) - void loadModelWrapper(string filename, FastTextModel& model) + + # Add 'except +' to the function declaration to let Cython safely raise an + # appropriate Python exception instead + void loadModelWrapper(string filename, FastTextModel& model) except + From ea50b7179c62c7b495e55f36a6879bbda1d86574 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Fri, 19 Aug 2016 14:30:49 +0700 Subject: [PATCH 039/109] Update test for the classifier --- .gitignore | 5 +-- Makefile | 35 ++++++++++--------- 
...pred_test.txt => classifier_pred_test.txt} | 0 ...{supervised_test.py => classifier_test.py} | 30 ++++++++-------- ...ed_params_test.txt => classifier_test.txt} | 0 test/download_dbpedia.sh | 27 ++++++++++++++ 6 files changed, 65 insertions(+), 32 deletions(-) rename test/{supervised_pred_test.txt => classifier_pred_test.txt} (100%) rename test/{supervised_test.py => classifier_test.py} (82%) rename test/{supervised_params_test.txt => classifier_test.txt} (100%) create mode 100644 test/download_dbpedia.sh diff --git a/.gitignore b/.gitignore index 176a0a3..65d35d7 100644 --- a/.gitignore +++ b/.gitignore @@ -22,8 +22,9 @@ facebookresearch-fasttext-* .eggs/ # For test -test/supervised_test_result.txt -test/supervised_pred_result.txt +test/classifier_pred_result.txt +test/classifier_test_result.txt +test/dbpedia.train # Misc TODO diff --git a/Makefile b/Makefile index 98e8f3c..e66134a 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ all: install test -test: test-skipgram test-cbow test-supervised +test: test-skipgram test-cbow test-classifier buildext: python setup.py build_ext --inplace @@ -20,7 +20,7 @@ README.rst: README.md upload: README.rst python setup.py sdist upload -install-dev: +install-dev: README.rst python setup.py develop .PHONY: install-dev @@ -48,22 +48,25 @@ test-cbow: fasttext/cpp/fasttext test/cbow_params_test.bin python test/cbow_test.py --verbose # Test for classifier -test/supervised_params_test.bin: - ./fasttext/cpp/fasttext supervised -input test/supervised_params_test.txt \ - -output test/supervised_params_test -dim 10 -lr 0.1 -wordNgrams 2 \ +test/dbpedia.train: test/download_dbpedia.sh + sh test/download_dbpedia.sh # Download & normalize training file + +test/classifier.bin: test/dbpedia.train + ./fasttext/cpp/fasttext supervised -input test/dbpedia.train \ + -output test/classifier -dim 100 -lr 0.1 -wordNgrams 2 \ -minCount 1 -bucket 2000000 -epoch 5 -thread 4 -test/supervised_test_result.txt: test/supervised_params_test.bin - 
./fasttext/cpp/fasttext test test/supervised_params_test.bin \ - test/supervised_params_test.txt > test/supervised_test_result.txt +test/classifier_test_result.txt: test/classifier.bin + ./fasttext/cpp/fasttext test test/classifier.bin \ + test/classifier_test.txt > test/classifier_test_result.txt -test/supervised_pred_result.txt: test/supervised_params_test.bin - ./fasttext/cpp/fasttext predict test/supervised_params_test.bin \ - test/supervised_pred_test.txt > \ - test/supervised_pred_result.txt +test/classifier_pred_result.txt: test/classifier.bin + ./fasttext/cpp/fasttext predict test/classifier.bin \ + test/classifier_pred_test.txt > \ + test/classifier_pred_result.txt -test-supervised: fasttext/cpp/fasttext test/supervised_params_test.bin \ - test/supervised_test_result.txt \ - test/supervised_pred_result.txt - python test/supervised_test.py --verbose +test-classifier: fasttext/cpp/fasttext test/classifier.bin \ + test/classifier_test_result.txt \ + test/classifier_pred_result.txt + python test/classifier_test.py --verbose --failfast diff --git a/test/supervised_pred_test.txt b/test/classifier_pred_test.txt similarity index 100% rename from test/supervised_pred_test.txt rename to test/classifier_pred_test.txt diff --git a/test/supervised_test.py b/test/classifier_test.py similarity index 82% rename from test/supervised_test.py rename to test/classifier_test.py index 928da34..8d561de 100644 --- a/test/supervised_test.py +++ b/test/classifier_test.py @@ -9,13 +9,14 @@ import fasttext as ft -supervised_file = path.join(path.dirname(__file__), 'supervised_params_test.bin') -input_file = path.join(path.dirname(__file__), 'supervised_params_test.txt') -pred_file = path.join(path.dirname(__file__), 'supervised_pred_test.txt') -output = path.join(path.dirname(__file__), 'generated_supervised') -test_result = path.join(path.dirname(__file__), 'supervised_test_result.txt') -pred_result = path.join(path.dirname(__file__), 'supervised_pred_result.txt') -test_file = 
input_file # Only for test +current_dir = path.dirname(__file__) +classifier_bin = path.join(current_dir, 'classifier.bin') +input_file = path.join(current_dir, 'dbpedia.train') +pred_file = path.join(current_dir, 'classifier_pred_test.txt') +output = path.join(current_dir, 'generated_classifier') +test_result = path.join(current_dir, 'classifier_test_result.txt') +pred_result = path.join(current_dir, 'classifier_pred_result.txt') +test_file = path.join(current_dir, 'classifier_test.txt') def read_labels(filename, label_prefix, unique=True): labels = [] @@ -44,18 +45,18 @@ def read_labels(filename, label_prefix, unique=True): labels.append(label) return labels -# Test to make sure that supervised interface run correctly -class TestSupervisedModel(unittest.TestCase): - def test_load_supervised_model(self): +# Test to make sure that classifier interface run correctly +class TestClassifierModel(unittest.TestCase): + def test_load_classifier_model(self): label_prefix='__label__' - model = ft.load_model(supervised_file, label_prefix=label_prefix) + model = ft.load_model(classifier_bin, label_prefix=label_prefix) # Make sure the model is returned correctly self.assertEqual(model.model_name, 'supervised') # Make sure all params loaded correctly # see Makefile on target test-supervised for the params - self.assertEqual(model.dim, 10) + self.assertEqual(model.dim, 100) self.assertEqual(model.word_ngrams, 2) self.assertEqual(model.min_count, 1) self.assertEqual(model.epoch, 5) @@ -108,18 +109,19 @@ def test_classifier_test(self): num_examples = int(lines[1][20:].strip()) # Load and test using the same model and test set - classifier = ft.load_model(supervised_file, label_prefix='__label__') + classifier = ft.load_model(classifier_bin, label_prefix='__label__') p_at_1, num_ex = classifier.test(test_file) # Make sure that the test result is the same as the result generated # by fasttext(1) + p_at_1 = float("{0:.2f}".format(p_at_1)) self.assertEqual(p_at_1, precision_at_one) 
self.assertEqual(num_ex, num_examples) def test_classifier_predict(self): label_prefix = '__label__' # Load the pre-trained classifier - classifier = ft.load_model(supervised_file, label_prefix=label_prefix) + classifier = ft.load_model(classifier_bin, label_prefix=label_prefix) # Read texts from the pred_file, prediction made by fasttext(1) texts = [] diff --git a/test/supervised_params_test.txt b/test/classifier_test.txt similarity index 100% rename from test/supervised_params_test.txt rename to test/classifier_test.txt diff --git a/test/download_dbpedia.sh b/test/download_dbpedia.sh new file mode 100644 index 0000000..2e817a2 --- /dev/null +++ b/test/download_dbpedia.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# Modified version of fasttext/cpp/classification-example.sh + +# Download and normalize data dbpedia.train +# Run: +# % sh test/download_dbpedia.sh + +myshuf() { + perl -MList::Util=shuffle -e 'print shuffle(<>);' "$@"; +} + +normalize_text() { + tr '[:upper:]' '[:lower:]' | sed -e 's/^/__label__/g' | \ + sed -e "s/'/ ' /g" -e 's/"//g' -e 's/\./ \. /g' -e 's/
/ /g' \ + -e 's/,/ , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \ + -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' | tr -s " " | myshuf +} + +echo "Downloading the dbpedia_csv.tar.gz ..." +wget -c "https://googledrive.com/host/0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k" \ + -O test/dbpedia_csv.tar.gz + +echo "Extract dbpedia_csv.tar.gz to test/" +tar xzvf test/dbpedia_csv.tar.gz" -C test/ + +echo "Creating the test/dbpedia.train ..." +cat test/dbpedia_csv/train.csv | normalize_text > test/dbpedia.train From b14d5416140fad93e87232cb64df245bb5a9af8a Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Fri, 19 Aug 2016 18:40:11 +0700 Subject: [PATCH 040/109] Remove --failfast on test-classifier --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e66134a..69a422d 100644 --- a/Makefile +++ b/Makefile @@ -68,5 +68,5 @@ test/classifier_pred_result.txt: test/classifier.bin test-classifier: fasttext/cpp/fasttext test/classifier.bin \ test/classifier_test_result.txt \ test/classifier_pred_result.txt - python test/classifier_test.py --verbose --failfast + python test/classifier_test.py --verbose From a708b5f1fba165a8ddfa51488650fde078c215d6 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Fri, 19 Aug 2016 18:40:42 +0700 Subject: [PATCH 041/109] Read version from a file --- fasttext/VERSION | 1 + fasttext/__init__.py | 5 ++++- setup.py | 8 +++++--- 3 files changed, 10 insertions(+), 4 deletions(-) create mode 100644 fasttext/VERSION diff --git a/fasttext/VERSION b/fasttext/VERSION new file mode 100644 index 0000000..b616048 --- /dev/null +++ b/fasttext/VERSION @@ -0,0 +1 @@ +0.6.2 diff --git a/fasttext/__init__.py b/fasttext/__init__.py index cc45fba..6af7367 100644 --- a/fasttext/__init__.py +++ b/fasttext/__init__.py @@ -3,4 +3,7 @@ from .fasttext import load_model from .fasttext import supervised -__VERSION__ = '0.6.2' +@property +def __VERSION__(): + with open('VERSION') as f: + return f.read().strip() diff --git 
a/setup.py b/setup.py index 91bdf86..62a50bb 100644 --- a/setup.py +++ b/setup.py @@ -3,8 +3,10 @@ from Cython.Build import cythonize import unittest -# For __VERSION__ -import fasttext +# Read the fastText.py version +def read_version(): + with open('fasttext/VERSION') as f: + return f.read().strip() # Define the C++ extension extensions = [ @@ -26,7 +28,7 @@ # Package details setup( name='fasttext', - version=fasttext.__VERSION__, + version=read_version(), author='Bayu Aldi Yansyah', author_email='bayualdiyansyah@gmail.com', url='https://github.com/pyk/fastText.py', From 42657e9a3b050a682388b1d3675823d3ef1363c0 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Fri, 19 Aug 2016 18:46:17 +0700 Subject: [PATCH 042/109] Add README.rst to the git index --- .gitignore | 1 - README.rst | 370 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 370 insertions(+), 1 deletion(-) create mode 100644 README.rst diff --git a/.gitignore b/.gitignore index 65d35d7..54f7624 100644 --- a/.gitignore +++ b/.gitignore @@ -28,4 +28,3 @@ test/dbpedia.train # Misc TODO -README.rst diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..8184104 --- /dev/null +++ b/README.rst @@ -0,0 +1,370 @@ +fasttext |Build Status| |PyPI version| +====================================== + +fasttext is a Python interface for `Facebook +fastText `__. + +Requirements +------------ + +fasttext support Python 2.6 or newer. It requires +`Cython `__ in order to build the +C++ extension. + +Installation +------------ + +.. code:: shell + + pip install fasttext + +Example usage +------------- + +This package has two main use cases: word representation learning and +text classification. + +These were described in the two papers +`1 <#enriching-word-vectors-with-subword-information>`__ and +`2 <#bag-of-tricks-for-efficient-text-classification>`__. 
+ +Word representation learning +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In order to learn word vectors, as described in +`1 <#enriching-word-vectors-with-subword-information>`__, we can use +``fasttext.skipgram`` and ``fasttext.cbow`` function like the following: + +.. code:: python + + import fasttext + + # Skipgram model + model = fasttext.skipgram('data.txt', 'model') + print model.words # list of words in dictionary + + # CBOW model + model = fasttext.cbow('data.txt', 'model') + print model.words # list of words in dictionary + +where ``data.txt`` is a training file containing ``utf-8`` encoded text. +By default the word vectors will take into account character n-grams +from 3 to 6 characters. + +At the end of optimization the program will save two files: +``model.bin`` and ``model.vec``. + +``model.vec`` is a text file containing the word vectors, one per line. +``model.bin`` is a binary file containing the parameters of the model +along with the dictionary and all hyper parameters. + +The binary file can be used later to compute word vectors or to restart +the optimization. + +The following ``fasttext(1)`` command is equivalent + +.. code:: shell + + # Skipgram model + ./fasttext skipgram -input data.txt -output model + + # CBOW model + ./fasttext cbow -input data.txt -output model + +Obtaining word vectors for out-of-vocabulary words +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The previously trained model can be used to compute word vectors for +out-of-vocabulary words. + +.. code:: python + + print model.get_vector('king') + # or just use a nice syntax + print model['king'] # get the vector of the word 'king' + +the following ``fasttext(1)`` command is equivalent: + +.. code:: shell + + echo "king" | ./fasttext print-vectors model.bin + +This will output the vector of word ``king`` to the standard output. + +Load pre-trained model +~~~~~~~~~~~~~~~~~~~~~~ + +We can use ``fasttext.load_model`` to load pre-trained model: + +.. 
code:: python + + model = fasttext.load_model('model.bin') + print model.words # list of words in dictionary + print model['king'] # get the vector of the word 'king' + +Text classification +~~~~~~~~~~~~~~~~~~~ + +This package can also be used to train supervised text classifiers and +load pre-trained classifier from fastText. + +In order to train a text classifier using the method described in +`2 <#bag-of-tricks-for-efficient-text-classification>`__, we can use the +following function: + +.. code:: python + + classifier = fasttext.supervised('data.train.txt', 'model') + +equivalent as ``fasttext(1)`` command: + +.. code:: shell + + ./fasttext supervised -input data.train.txt -output model + +where ``data.train.txt`` is a text file containing a training sentence +per line along with the labels. By default, we assume that labels are +words that are prefixed by the string ``__label__``. + +We can specify the label prefix with the ``label_prefix`` param: + +.. code:: python + + classifier = fasttext.supervised('data.train.txt', 'model', label_prefix='__label__') + +equivalent as ``fasttext(1)`` command: + +.. code:: shell + + ./fasttext supervised -input data.train.txt -output model -label '__label__' + +This will output two files: ``model.bin`` and ``model.vec``. + +Once the model was trained, we can evaluate it by computing the +precision at 1 (P@1) on a test set using ``classifier.test`` function: + +.. code:: python + + precision_at_one, nexamples = classifier.test('test.txt') + print 'P@1:', precision_at_one + print 'Number of examples:', nexamples + +This will print the same output to stdout as: + +.. code:: shell + + ./fasttext test model.bin test.txt + +In order to obtain the most likely label for a list of text, we can use +``classifer.predict`` method: + +.. code:: python + + texts = ['example very long text 1', 'example very longtext 2'] + labels = classifier.predict(texts) + print labels + +This interface is equivalent as ``fasttext(1)`` predict command. 
The +same model with the same input set will have the same prediction. + +API documentation +----------------- + +Skipgram model +~~~~~~~~~~~~~~ + +Train & load skipgram model + +.. code:: python + + model = fasttext.skipgram(params) + +CBOW model +~~~~~~~~~~ + +Train & load CBOW model + +.. code:: python + + model = fasttext.cbow(params) + +Load pre-trained model +~~~~~~~~~~~~~~~~~~~~~~ + +File ``.bin`` that previously trained or generated by fastText can be +loaded using this function + +.. code:: python + + model = fasttext.load_model('model.bin') + +Attributes and methods for the model +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Skipgram and CBOW model have the following atributes & methods + +.. code:: python + + model.model_name # Model name + model.words # List of words in the dictionary + model.dim # Size of word vector + model.ws # Size of context window + model.epoch # Number of epochs + model.min_count # Minimal number of word occurences + model.neg # Number of negative sampled + model.word_ngrams # Max length of word ngram + model.loss_name # Loss function name + model.bucket # Number of buckets + model.minn # Min length of char ngram + model.maxn # Max length of char ngram + model.lr_update_rate # Rate of updates for the learning rate + model.t # Value of sampling threshold + model.get_vector(word) # Get the vector of specified word + model[word] # Get the vector of specified word + +Supervised model +~~~~~~~~~~~~~~~~ + +Train & load the classifier + +.. code:: python + + classifier = fasttext.supervised(params) + +Load pre-trained classifier +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +File ``.bin`` that previously trained or generated by fastText can be +loaded using this function. + +.. code:: shell + + ./fasttext supervised -input train.txt -output classifier -label 'some_prefix' + +.. 
code:: python + + classifier = fasttext.load_model('classifier.bin', label_prefix='some_prefix') + +Test classifier +~~~~~~~~~~~~~~~ + +This is equivalent as ``fasttext(1)`` test command. The test using the +same model and test set will produce the same value for the precision at +one and the number of examples. + +.. code:: python + + precision_at_one, nexamples = classifier.test(test_file) + +Predict the most-likely label of texts +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This interface is equivalent as ``fasttext(1)`` predict command. + +``texts`` is an array of string + +.. code:: python + + labels = classifier.predict(texts) + +Attributes and methods for the classifier +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Classifier have the following atributes & methods + +.. code:: python + + classifier.labels # List of labels + classifier.label_prefix # Prefix of the label + classifier.dim # Size of word vector + classifier.ws # Size of context window + classifier.epoch # Number of epochs + classifier.min_count # Minimal number of word occurences + classifier.neg # Number of negative sampled + classifier.word_ngrams # Max length of word ngram + classifier.loss_name # Loss function name + classifier.bucket # Number of buckets + classifier.minn # Min length of char ngram + classifier.maxn # Max length of char ngram + classifier.lr_update_rate # Rate of updates for the learning rate + classifier.t # Value of sampling threshold + classifier.test(filename) # Test the classifier + classifier.predict(texts) # Predict the most likely label + +Params +~~~~~~ + +List of available ``params`` and their default value: + +:: + + For Skipgram, CBOW and Supervised model + input training file path + output output file path + lr learning rate [0.05] + lr_update_rate change the rate of updates for the learning rate [100] + dim size of word vectors [100] + ws size of the context window [5] + epoch number of epochs [5] + min_count minimal number of word occurences [1] + neg number of 
negatives sampled [5] + word_ngrams max length of word ngram [1] + loss loss function {ns, hs, softmax} [ns] + bucket number of buckets [2000000] + minn min length of char ngram [3] + maxn max length of char ngram [6] + thread number of threads [12] + t sampling threshold [0.0001] + silent disable the log output from the C++ extension [1] + + For Supervised model only + label_prefix Prefix of the label name [__label__] + +References +---------- + +Enriching Word Vectors with Subword Information +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +[1] P. Bojanowski\*, E. Grave\*, A. Joulin, T. Mikolov, `*Enriching Word +Vectors with Subword +Information* `__ + +:: + + @article{bojanowski2016enriching, + title={Enriching Word Vectors with Subword Information}, + author={Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas}, + journal={arXiv preprint arXiv:1607.04606}, + year={2016} + } + +Bag of Tricks for Efficient Text Classification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +[2] A. Joulin, E. Grave, P. Bojanowski, T. Mikolov, `*Bag of Tricks for +Efficient Text +Classification* `__ + +:: + + @article{joulin2016bag, + title={Bag of Tricks for Efficient Text Classification}, + author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Mikolov, Tomas}, + journal={arXiv preprint arXiv:1607.01759}, + year={2016} + } + +(\* These authors contributed equally.) + +Join the fastText community +--------------------------- + +- Facebook page: https://www.facebook.com/groups/1174547215919768 +- Google group: + https://groups.google.com/forum/#!forum/fasttext-library + +.. |Build Status| image:: https://travis-ci.org/salestock/fastText.py.svg?branch=master + :target: https://travis-ci.org/salestock/fastText.py +.. 
|PyPI version| image:: https://badge.fury.io/py/fasttext.svg + :target: https://badge.fury.io/py/fasttext From b700686fd96ea0335cece5b905d6dc52357ca622 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Fri, 19 Aug 2016 18:56:11 +0700 Subject: [PATCH 043/109] Fix test/download_dbpedia.sh --- test/download_dbpedia.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/download_dbpedia.sh b/test/download_dbpedia.sh index 2e817a2..adb7fa1 100644 --- a/test/download_dbpedia.sh +++ b/test/download_dbpedia.sh @@ -21,7 +21,7 @@ wget -c "https://googledrive.com/host/0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k" \ -O test/dbpedia_csv.tar.gz echo "Extract dbpedia_csv.tar.gz to test/" -tar xzvf test/dbpedia_csv.tar.gz" -C test/ +tar xzvf test/dbpedia_csv.tar.gz -C test/ echo "Creating the test/dbpedia.train ..." cat test/dbpedia_csv/train.csv | normalize_text > test/dbpedia.train From dcae6e0fa1c79e4f5e166b3192c98abaa34b0133 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Fri, 19 Aug 2016 21:09:42 +0700 Subject: [PATCH 044/109] Redirect stdout to /dev/null --- Makefile | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 69a422d..4c8fb12 100644 --- a/Makefile +++ b/Makefile @@ -28,21 +28,25 @@ fasttext/cpp/fasttext: make --directory fasttext/cpp/ # Test for skipgram model +# Redirect stdout to /dev/null to prevent exceed the log limit size from +# Travis CI test/skipgram_params_test.bin: ./fasttext/cpp/fasttext skipgram -input test/params_test.txt -output \ test/skipgram_params_test -lr 0.025 -dim 100 -ws 5 -epoch 1 \ -minCount 1 -neg 5 -loss ns -bucket 2000000 -minn 3 -maxn 6 \ - -thread 4 -lrUpdateRate 100 -t 1e-4 + -thread 4 -lrUpdateRate 100 -t 1e-4 >> /dev/null test-skipgram: fasttext/cpp/fasttext test/skipgram_params_test.bin python test/skipgram_test.py --verbose # Test for cbow model +# Redirect stdout to /dev/null to prevent exceed the log limit size from +# Travis CI test/cbow_params_test.bin: 
./fasttext/cpp/fasttext cbow -input test/params_test.txt -output \ test/cbow_params_test -lr 0.005 -dim 50 -ws 5 -epoch 1 \ -minCount 1 -neg 5 -loss ns -bucket 2000000 -minn 3 -maxn 6 \ - -thread 4 -lrUpdateRate 100 -t 1e-4 + -thread 4 -lrUpdateRate 100 -t 1e-4 >> /dev/null test-cbow: fasttext/cpp/fasttext test/cbow_params_test.bin python test/cbow_test.py --verbose @@ -51,10 +55,12 @@ test-cbow: fasttext/cpp/fasttext test/cbow_params_test.bin test/dbpedia.train: test/download_dbpedia.sh sh test/download_dbpedia.sh # Download & normalize training file +# Redirect stdout to /dev/null to prevent exceed the log limit size from +# Travis CI test/classifier.bin: test/dbpedia.train ./fasttext/cpp/fasttext supervised -input test/dbpedia.train \ -output test/classifier -dim 100 -lr 0.1 -wordNgrams 2 \ - -minCount 1 -bucket 2000000 -epoch 5 -thread 4 + -minCount 1 -bucket 2000000 -epoch 5 -thread 4 >> /dev/null test/classifier_test_result.txt: test/classifier.bin ./fasttext/cpp/fasttext test test/classifier.bin \ From c2cd5c604cce588c7c2f685ff0681f9375ed10f6 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Fri, 19 Aug 2016 21:25:19 +0700 Subject: [PATCH 045/109] Enable silent mode in tests --- test/cbow_test.py | 2 +- test/classifier_test.py | 2 +- test/skipgram_test.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/cbow_test.py b/test/cbow_test.py index 566d348..0ae9dd5 100644 --- a/test/cbow_test.py +++ b/test/cbow_test.py @@ -59,7 +59,7 @@ def test_train_cbow_model(self): thread=4 lr_update_rate=10000 t=1e-4 - silent=0 + silent=1 # train cbow model model = ft.cbow(input_file, output, lr, dim, ws, epoch, min_count, diff --git a/test/classifier_test.py b/test/classifier_test.py index 8d561de..08e6186 100644 --- a/test/classifier_test.py +++ b/test/classifier_test.py @@ -77,7 +77,7 @@ def test_train_classifier(self): word_ngrams=3 bucket=2000000 thread=4 - silent=0 + silent=1 label_prefix='__label__' # Train the classifier diff --git 
a/test/skipgram_test.py b/test/skipgram_test.py index 9c9b9e1..52feb6d 100644 --- a/test/skipgram_test.py +++ b/test/skipgram_test.py @@ -59,7 +59,7 @@ def test_train_skipgram_model(self): thread=4 lr_update_rate=10000 t=1e-4 - silent=0 + silent=1 # train skipgram model model = ft.skipgram(input_file, output, lr, dim, ws, epoch, min_count, From 7c1407f32678cc4afb4d75f11c02ec9a0405304c Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Fri, 19 Aug 2016 23:27:40 +0700 Subject: [PATCH 046/109] Update v0.6.2 to v0.6.3 --- fasttext/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fasttext/VERSION b/fasttext/VERSION index b616048..844f6a9 100644 --- a/fasttext/VERSION +++ b/fasttext/VERSION @@ -1 +1 @@ -0.6.2 +0.6.3 From f7af9c5fe5ab9cfc50b7e037949ba770c4bd7652 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Sat, 20 Aug 2016 07:32:50 +0700 Subject: [PATCH 047/109] Add fasttext/VERSION to the package file --- MANIFEST.in | 1 + fasttext/VERSION | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/MANIFEST.in b/MANIFEST.in index 6dc8d42..f062fd5 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,5 +2,6 @@ global-include *.pyx global-include *.pxd global-include *.cc global-include *.h +global-include fasttext/VERSION prune facebookresearch-fastText-* diff --git a/fasttext/VERSION b/fasttext/VERSION index 844f6a9..d2b13eb 100644 --- a/fasttext/VERSION +++ b/fasttext/VERSION @@ -1 +1 @@ -0.6.3 +0.6.4 From 2ea655c1b9120cb9a5fcb1decdc1d28225aa72fe Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Mon, 22 Aug 2016 12:14:07 +0700 Subject: [PATCH 048/109] Add examples, resolve #19 --- examples/classification_example.py | 41 ++++++++++++++++++++++++++++++ examples/wordvector_example.py | 32 +++++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 examples/classification_example.py create mode 100644 examples/wordvector_example.py diff --git a/examples/classification_example.py 
b/examples/classification_example.py new file mode 100644 index 0000000..d699436 --- /dev/null +++ b/examples/classification_example.py @@ -0,0 +1,41 @@ +import fasttext + +# Download the dbpedia.train first on test/ +# and move to the example directory +current_dir = path.dirname(__file__) +input_file = path.join(current_dir, 'dbpedia.train') +output = '/tmp/classifier' +test_file = '../test/classifier_test.txt' + +# set params +dim=10 +lr=0.005 +epoch=1 +min_count=1 +word_ngrams=3 +bucket=2000000 +thread=4 +silent=1 +label_prefix='__label__' + +# Train the classifier +classifier = ft.supervised(input_file, output, dim=dim, lr=lr, epoch=epoch, + min_count=min_count, word_ngrams=word_ngrams, bucket=bucket, + thread=thread, silent=silent, label_prefix=label_prefix) + +# Test the classifier +p_at_1, num_ex = classifier.test(test_file) +print 'P@1:', p_at_1 +print 'Number of examples:', num_ex + +# Predict some text +# (Example text is from dbpedia.train) +texts = ['birchas chaim , yeshiva birchas chaim is a orthodox jewish mesivta \ + high school in lakewood township new jersey . it was founded by rabbi \ + shmuel zalmen stein in 2001 after his father rabbi chaim stein asked \ + him to open a branch of telshe yeshiva in lakewood . as of the 2009-10 \ + school year the school had an enrollment of 76 students and 6 . 6 \ + classroom teachers ( on a fte basis ) for a student–teacher ratio of \ + 11 . 
5 1 .'] +labels = classifier.predict(texts) +print labels diff --git a/examples/wordvector_example.py b/examples/wordvector_example.py new file mode 100644 index 0000000..7ce65aa --- /dev/null +++ b/examples/wordvector_example.py @@ -0,0 +1,32 @@ +import fasttext + +INPUT_TXT = '/path/to/file.txt' +OUTPUT_PATH_SKIPGRAM = '/tmp/skipgram' +OUTPUT_PATH_CBOW = '/tmp/cbow' + +# Learn the word representation using skipgram model +skipgram = fasttext.skipgram(INPUT_TXT, OUTPUT_PATH, lr=0.02, dim=300, ws=5, + epoch=1, min_count=5, neg=5, loss='ns', bucket=2000000, minn=3, maxn=6, + thread=4, t=1e-4, lr_update_rate=100) + +# Get the vector of some word +print skipgram['word'] + +# Learn the word representation using cbow model +cbow = fasttext.cbow(INPUT_TXT, OUTPUT_PATH, lr=0.02, dim=300, ws=5, + epoch=1, min_count=5, neg=5, loss='ns', bucket=2000000, minn=3, maxn=6, + thread=4, t=1e-4, lr_update_rate=100) + +# Get the vector of some word +print cbow['word'] + +# Load pre-trained skipgram model +SKIPGRAM_BIN = OUTPUT_PATH_SKIPGRAM + '.bin' +skipgram = fasttext.load_model(SKIPGRAM_BIN) +print skipgram['word'] + +# Load pre-trained cbow model +CBOW_BIN = OUTPUT_PATH_CBOW + '.bin' +cbow = fasttext.load_model(CBOW_BIN) +print cbow['word'] + From aee999b10439646101a7ed19f52a2e4881e9d00e Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Mon, 22 Aug 2016 12:44:05 +0700 Subject: [PATCH 049/109] fastText: Update 86e6b44 to 1826a12 --- fasttext/cpp/LAST_COMMIT | 2 +- fasttext/cpp/README.md | 13 +++++--- fasttext/cpp/src/dictionary.cc | 4 +-- fasttext/cpp/src/fasttext.cc | 60 +++++++++++++++++++++++----------- fasttext/cpp/src/model.cc | 56 ++++++++++++++++++++++++------- fasttext/cpp/src/model.h | 11 +++++-- update-fasttext.sh | 2 +- 7 files changed, 106 insertions(+), 42 deletions(-) diff --git a/fasttext/cpp/LAST_COMMIT b/fasttext/cpp/LAST_COMMIT index 3626474..bed3022 100644 --- a/fasttext/cpp/LAST_COMMIT +++ b/fasttext/cpp/LAST_COMMIT @@ -1 +1 @@ -86e6b44 +1826a12 diff 
--git a/fasttext/cpp/README.md b/fasttext/cpp/README.md index 2dbbf11..1aefd9a 100644 --- a/fasttext/cpp/README.md +++ b/fasttext/cpp/README.md @@ -89,20 +89,23 @@ $ ./fasttext supervised -input train.txt -output model where `train.txt` is a text file containing a training sentence per line along with the labels. By default, we assume that labels are words that are prefixed by the string `__label__`. This will output two files: `model.bin` and `model.vec`. -Once the model was trained, you can evaluate it by computing the precision at 1 (P@1) on a test set using: +Once the model was trained, you can evaluate it by computing the precision and recall at k (P@k and R@k) on a test set using: ``` -$ ./fasttext test model.bin test.txt +$ ./fasttext test model.bin test.txt k ``` -In order to obtain the most likely label for a piece of text, use: +The argument `k` is optional, and is equal to `1` by default. + +In order to obtain the k most likely labels for a piece of text, use: ``` -$ ./fasttext predict model.bin test.txt +$ ./fasttext predict model.bin test.txt k ``` where `test.txt` contains a piece of text to classify per line. -Doing so will output to the standard output the most likely label per line. +Doing so will print to the standard output the k most likely labels for each line. +The argument `k` is optional, and equal to `1` by default. See `classification-example.sh` for an example use case. In order to reproduce results from the paper [2](#bag-of-tricks-for-efficient-text-classification), run `classification-results.sh`, this will download all the datasets and reproduce the results from Table 1. 
diff --git a/fasttext/cpp/src/dictionary.cc b/fasttext/cpp/src/dictionary.cc index c1cd3f7..266bc27 100644 --- a/fasttext/cpp/src/dictionary.cc +++ b/fasttext/cpp/src/dictionary.cc @@ -153,7 +153,7 @@ std::string Dictionary::readWord(std::ifstream& fin) std::string word; while (fin.peek() != EOF) { fin.get(c); - if (isspace(c)) { + if (isspace(c) || c == 0) { if (word.empty()) { if (c == '\n') return EOS; continue; @@ -249,11 +249,11 @@ int32_t Dictionary::getLine(std::ifstream& ifs, ifs.seekg(std::streampos(0)); } while (!(token = readWord(ifs)).empty()) { - ntokens++; if (token == EOS) break; int32_t wid = getId(token); if (wid < 0) continue; entry_type type = getType(wid); + ntokens++; if (type == entry_type::word && !discard(wid, uniform(rng))) { words.push_back(wid); } diff --git a/fasttext/cpp/src/fasttext.cc b/fasttext/cpp/src/fasttext.cc index bc22f56..38397a9 100644 --- a/fasttext/cpp/src/fasttext.cc +++ b/fasttext/cpp/src/fasttext.cc @@ -161,8 +161,8 @@ void skipgram(Dictionary& dict, Model& model, } } -void test(Dictionary& dict, Model& model, std::string filename) { - int32_t nexamples = 0; +void test(Dictionary& dict, Model& model, std::string filename, int32_t k) { + int32_t nexamples = 0, nlabels = 0; double precision = 0.0; std::vector line, labels; std::ifstream ifs(filename); @@ -174,22 +174,25 @@ void test(Dictionary& dict, Model& model, std::string filename) { dict.getLine(ifs, line, labels, model.rng); dict.addNgrams(line, args.wordNgrams); if (labels.size() > 0 && line.size() > 0) { - int32_t i = model.predict(line); - if (std::find(labels.begin(), labels.end(), i) != labels.end()) { - precision += 1.0; + std::vector> predictions; + model.predict(line, k, predictions); + for (auto it = predictions.cbegin(); it != predictions.cend(); it++) { + if (std::find(labels.begin(), labels.end(), it->second) != labels.end()) { + precision += 1.0; + } } nexamples++; + nlabels += labels.size(); } } ifs.close(); std::cout << std::setprecision(3); - 
std::cout << "P@1: " << precision / nexamples << std::endl; + std::cout << "P@" << k << ": " << precision / (k * nexamples) << std::endl; + std::cout << "R@" << k << ": " << precision / nlabels << std::endl; std::cout << "Number of examples: " << nexamples << std::endl; } -void predict(Dictionary& dict, Model& model, std::string filename) { - int32_t nexamples = 0; - double precision = 0.0; +void predict(Dictionary& dict, Model& model, std::string filename, int32_t k) { std::vector line, labels; std::ifstream ifs(filename); if (!ifs.is_open()) { @@ -199,12 +202,19 @@ void predict(Dictionary& dict, Model& model, std::string filename) { while (ifs.peek() != EOF) { dict.getLine(ifs, line, labels, model.rng); dict.addNgrams(line, args.wordNgrams); - if (line.size() > 0) { - int32_t i = model.predict(line); - std::cout << dict.getLabel(i) << std::endl; - } else { + if (line.empty()) { std::cout << "n/a" << std::endl; + continue; + } + std::vector> predictions; + model.predict(line, k, predictions); + for (auto it = predictions.cbegin(); it != predictions.cend(); it++) { + if (it != predictions.cbegin()) { + std::cout << ' '; + } + std::cout << dict.getLabel(it->second); } + std::cout << std::endl; } ifs.close(); } @@ -275,17 +285,19 @@ void printUsage() { void printTestUsage() { std::cout - << "usage: fasttext test \n\n" + << "usage: fasttext test []\n\n" << " model filename\n" << " test data filename\n" + << " (optional; 1 by default) predict top k labels\n" << std::endl; } void printPredictUsage() { std::cout - << "usage: fasttext predict \n\n" + << "usage: fasttext predict []\n\n" << " model filename\n" << " test data filename\n" + << " (optional; 1 by default) predict top k labels\n" << std::endl; } @@ -297,7 +309,12 @@ void printPrintVectorsUsage() { } void test(int argc, char** argv) { - if (argc != 4) { + int32_t k; + if (argc == 4) { + k = 1; + } else if (argc == 5) { + k = atoi(argv[4]); + } else { printTestUsage(); exit(EXIT_FAILURE); } @@ -306,12 +323,17 @@ 
void test(int argc, char** argv) { loadModel(std::string(argv[2]), dict, input, output); Model model(input, output, args.dim, args.lr, 1); model.setTargetCounts(dict.getCounts(entry_type::label)); - test(dict, model, std::string(argv[3])); + test(dict, model, std::string(argv[3]), k); exit(0); } void predict(int argc, char** argv) { - if (argc != 4) { + int32_t k; + if (argc == 4) { + k = 1; + } else if (argc == 5) { + k = atoi(argv[4]); + } else { printPredictUsage(); exit(EXIT_FAILURE); } @@ -320,7 +342,7 @@ void predict(int argc, char** argv) { loadModel(std::string(argv[2]), dict, input, output); Model model(input, output, args.dim, args.lr, 1); model.setTargetCounts(dict.getCounts(entry_type::label)); - predict(dict, model, std::string(argv[3])); + predict(dict, model, std::string(argv[3]), k); exit(0); } diff --git a/fasttext/cpp/src/model.cc b/fasttext/cpp/src/model.cc index a6de579..98089a8 100644 --- a/fasttext/cpp/src/model.cc +++ b/fasttext/cpp/src/model.cc @@ -95,34 +95,66 @@ real Model::softmax(int32_t target) { return -utils::log(output_[target]); } -int32_t Model::predict(const std::vector& input) { +void Model::computeHidden(const std::vector& input) { hidden_.zero(); for (auto it = input.cbegin(); it != input.cend(); ++it) { hidden_.addRow(wi_, *it); } hidden_.mul(1.0 / input.size()); +} + +bool Model::comparePairs(const std::pair &l, + const std::pair &r) { + return l.first > r.first; +} +void Model::predict(const std::vector& input, int32_t k, + std::vector>& heap) { + assert(k > 0); + heap.reserve(k + 1); + computeHidden(input); if (args.loss == loss_name::hs) { - real max = -1e10; - int32_t argmax = -1; - dfs(2 * osz_ - 2, 0.0, max, argmax); - return argmax; + dfs(k, 2 * osz_ - 2, 0.0, heap); } else { output_.mul(wo_, hidden_); - return output_.argmax(); + findKBest(k, heap); + } + std::sort_heap(heap.begin(), heap.end(), comparePairs); +} + +void Model::findKBest(int32_t k, std::vector>& heap) { + for (int32_t i = 0; i < osz_; i++) { + if 
(heap.size() == k && output_[i] < heap.front().first) { + continue; + } + heap.push_back(std::make_pair(output_[i], i)); + std::push_heap(heap.begin(), heap.end(), comparePairs); + if (heap.size() > k) { + std::pop_heap(heap.begin(), heap.end(), comparePairs); + heap.pop_back(); + } } } -void Model::dfs(int32_t node, real score, real& max, int32_t& argmax) { - if (score < max) return; +void Model::dfs(int32_t k, int32_t node, real score, + std::vector>& heap) { + if (heap.size() == k && score < heap.front().first) { + return; + } + if (tree[node].left == -1 && tree[node].right == -1) { - max = score; - argmax = node; + heap.push_back(std::make_pair(score, node)); + std::push_heap(heap.begin(), heap.end(), comparePairs); + if (heap.size() > k) { + std::pop_heap(heap.begin(), heap.end(), comparePairs); + heap.pop_back(); + } return; } + real f = utils::sigmoid(wo_.dotRow(hidden_, node - osz_)); - dfs(tree[node].left, score + utils::log(1.0 - f), max, argmax); - dfs(tree[node].right, score + utils::log(f), max, argmax); + dfs(k, tree[node].left, score + utils::log(1.0 - f), heap); + dfs(k, tree[node].right, score + utils::log(f), heap); } real Model::update(const std::vector& input, int32_t target) { diff --git a/fasttext/cpp/src/model.h b/fasttext/cpp/src/model.h index 5fc1702..9a6a48a 100644 --- a/fasttext/cpp/src/model.h +++ b/fasttext/cpp/src/model.h @@ -12,6 +12,7 @@ #include #include +#include #include "matrix.h" #include "vector.h" @@ -38,6 +39,9 @@ class Model { static real lr_; + static bool comparePairs(const std::pair&, + const std::pair&); + std::vector negatives; size_t negpos; std::vector< std::vector > paths; @@ -58,9 +62,12 @@ class Model { real hierarchicalSoftmax(int32_t); real softmax(int32_t); - int32_t predict(const std::vector&); - void dfs(int32_t, real, real&, int32_t&); + void predict(const std::vector&, int32_t, + std::vector>&); + void dfs(int32_t, int32_t, real, std::vector>&); + void findKBest(int32_t, std::vector>&); real update(const 
std::vector&, int32_t); + void computeHidden(const std::vector&); void setTargetCounts(const std::vector&); void initTableNegatives(const std::vector&); diff --git a/update-fasttext.sh b/update-fasttext.sh index 926f3e1..ffbb148 100644 --- a/update-fasttext.sh +++ b/update-fasttext.sh @@ -1,4 +1,4 @@ -NEW_VERSION=86e6b44 +NEW_VERSION=1826a12 CURRENT_VERSION=$(cat fasttext/cpp/LAST_COMMIT) if [ "$NEW_VERSION" = "$CURRENT_VERSION" ]; then From f681e919defe87909de993bcb1206904eb109f27 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Mon, 22 Aug 2016 16:21:06 +0700 Subject: [PATCH 050/109] Implement k-best labels for classifier & Add recall to the test results --- fasttext/fasttext.pyx | 28 +++++++++++++++++----------- fasttext/interface.cc | 41 +++++++++++++++++++++++++++++++---------- fasttext/interface.h | 4 ++-- fasttext/interface.pxd | 4 ++-- fasttext/model.py | 28 ++++++++++++++++++++-------- 5 files changed, 72 insertions(+), 33 deletions(-) diff --git a/fasttext/fasttext.pyx b/fasttext/fasttext.pyx index bd8b352..90d3737 100644 --- a/fasttext/fasttext.pyx +++ b/fasttext/fasttext.pyx @@ -8,11 +8,13 @@ from interface cimport Dictionary # Python/C++ standart libraries from libc.stdlib cimport malloc, free from libcpp.string cimport string +from libcpp.vector cimport vector # Python module import os from model import WordVectorModel from model import SupervisedModel +from model import ClassifierTestResult as CTRes from builtins import bytes # This class wrap C++ class FastTextModel, so it can be accessed via Python @@ -29,19 +31,23 @@ cdef class FastTextModelWrapper: word_bytes = bytes(word, 'utf-8') return self.fm.getVectorWrapper(word_bytes) - def classifier_test(self, test_file): + def classifier_test(self, test_file, k): test_file = bytes(test_file, 'utf-8') - result = self.fm.classifierTest(test_file) - precision_at_one = float(result[0]) - num_examples = int(result[1]) - return precision_at_one, num_examples - - def classifier_predict(self, text): - 
cdef string cpp_string + result = self.fm.classifierTest(test_file, k) + precision = float(result[0]) + recall = float(result[1]) + nexamples = int(result[2]) + return CTRes(precision, recall, nexamples) + + def classifier_predict(self, text, k): + cdef vector[string] raw_labels text_bytes = bytes(text, 'utf-8') - cpp_string = self.fm.classifierPredict(text_bytes) - label = cpp_string.decode('utf-8') - return label + labels = [] + raw_labels = self.fm.classifierPredict(text_bytes, k) + for raw_label in raw_labels: + label = raw_label.decode('utf-8') + labels.append(label) + return labels @property def dim(self): diff --git a/fasttext/interface.cc b/fasttext/interface.cc index 29a98c0..b3d303e 100644 --- a/fasttext/interface.cc +++ b/fasttext/interface.cc @@ -84,7 +84,8 @@ std::vector FastTextModel::getVectorWrapper(std::string word) return vector; } -std::vector FastTextModel::classifierTest(std::string filename) +std::vector FastTextModel::classifierTest(std::string filename, + int32_t k) { /* Initialize the model * We use default value of learning rate here, since the fasttext(1) test @@ -96,8 +97,10 @@ std::vector FastTextModel::classifierTest(std::string filename) model.setTargetCounts(_dict.getCounts(entry_type::label)); int32_t nexamples = 0; + int32_t nlabels = 0; double precision = 0.0; - std::vector line, labels; + std::vector line; + std::vector labels; std::ifstream ifs(filename); if(!ifs.is_open()) { std::cerr << "interface.cc: Test file cannot be opened!" 
<< std::endl; @@ -108,21 +111,32 @@ std::vector FastTextModel::classifierTest(std::string filename) _dict.getLine(ifs, line, labels, model.rng); _dict.addNgrams(line, wordNgrams); if(labels.size() > 0 && line.size() > 0) { - int32_t i = model.predict(line); - if(std::find(labels.begin(), labels.end(), i) != labels.end()) { - precision += 1.0; + std::vector> predictions; + model.predict(line, k, predictions); + for(auto it = predictions.cbegin(); it != predictions.cend(); + it++) { + int32_t i = it->second; + if(std::find(labels.begin(), labels.end(), i) + != labels.end()) { + precision += 1.0; + } } nexamples++; + nlabels += labels.size(); } } ifs.close(); std::setprecision(3); - std::vector result = {precision/nexamples, (double)nexamples}; + std::vector result; + result.push_back(precision/(k * nexamples)); + result.push_back(precision/nlabels); + result.push_back((double)nexamples); return result; } -std::string FastTextModel::classifierPredict(std::string text) +std::vector FastTextModel::classifierPredict(std::string text, + int32_t k) { /* Initialize the model * We use default value of learning rate here, since the fasttext(1) test @@ -156,11 +170,18 @@ std::string FastTextModel::classifierPredict(std::string text) } _dict.addNgrams(text_word_ids, wordNgrams); + std::vector labels; if(text_word_ids.size() > 0) { - int32_t i = model.predict(text_word_ids); - return _dict.getLabel(i); + std::vector> predictions; + + model.predict(text_word_ids, k, predictions); + for(auto it = predictions.cbegin(); it != predictions.cend(); it++) { + labels.push_back(_dict.getLabel(it->second)); + } + + return labels; } else { - return "n/a"; + return labels; } } diff --git a/fasttext/interface.h b/fasttext/interface.h index 9a51524..11fbe74 100644 --- a/fasttext/interface.h +++ b/fasttext/interface.h @@ -35,8 +35,8 @@ class FastTextModel { std::vector getWords(); std::vector getVectorWrapper(std::string word); - std::vector classifierTest(std::string filename); - std::string 
classifierPredict(std::string text); + std::vector classifierTest(std::string filename, int32_t k); + std::vector classifierPredict(std::string text, int32_t k); void addWord(std::string word); void setDict(Dictionary dict); diff --git a/fasttext/interface.pxd b/fasttext/interface.pxd index 7a6fe34..212a3f6 100644 --- a/fasttext/interface.pxd +++ b/fasttext/interface.pxd @@ -35,8 +35,8 @@ cdef extern from "interface.h": vector[string] getWords() vector[real] getVectorWrapper(string word) - vector[double] classifierTest(string filename) - string classifierPredict(string text) + vector[double] classifierTest(string filename, int32_t k) + vector[string] classifierPredict(string text, int32_t k) Dictionary getDictionary() diff --git a/fasttext/model.py b/fasttext/model.py index 98b8ae6..b2ea8ea 100644 --- a/fasttext/model.py +++ b/fasttext/model.py @@ -2,7 +2,7 @@ import numpy as np from numpy.linalg import norm - +# Class for Skipgram and CBOW model class WordVectorModel(object): def __init__(self, model, words): self._model = model @@ -37,6 +37,7 @@ def cosine_similarity(self, first_word, second_word): cosine_sim = dot_product / (norm(v1) * norm(v2)) return cosine_sim +# Class for classifier model class SupervisedModel(object): def __init__(self, model, labels, label_prefix): self._model = model @@ -56,13 +57,24 @@ def __init__(self, model, labels, label_prefix): self.t = model.t; self.label_prefix = label_prefix; - def test(self, test_file): - return self._model.classifier_test(test_file) + def test(self, test_file, k=1): + return self._model.classifier_test(test_file, k) - def predict(self, texts): - labels = [] + def predict(self, texts, k=1): + all_labels = [] for text in texts: - label = self._model.classifier_predict(text) - labels.append(label.replace(self.label_prefix, '')) - return labels + labels = [] + raw_labels = self._model.classifier_predict(text, k=k) + for raw_label in raw_labels: + label = raw_label.replace(self.label_prefix, '') + 
labels.append(label) + all_labels.append(labels) + return all_labels + +# Class for test result +class ClassifierTestResult(object): + def __init__(self, precision, recall, nexamples): + self.precision = precision + self.recall = recall + self.nexamples = nexamples From d6fae1cd4f16333a11527c93ad3549f9267f4ef5 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Mon, 22 Aug 2016 16:22:16 +0700 Subject: [PATCH 051/109] Add test for k-best labels classifier --- Makefile | 8 +++- test/classifier_test.py | 99 ++++++++++++++++++++++++++++++----------- 2 files changed, 81 insertions(+), 26 deletions(-) diff --git a/Makefile b/Makefile index 4c8fb12..a1dd9bf 100644 --- a/Makefile +++ b/Makefile @@ -71,8 +71,14 @@ test/classifier_pred_result.txt: test/classifier.bin test/classifier_pred_test.txt > \ test/classifier_pred_result.txt +test/classifier_pred_k_result.txt: test/classifier.bin + ./fasttext/cpp/fasttext predict test/classifier.bin \ + test/classifier_pred_test.txt 5 > \ + test/classifier_pred_k_result.txt + test-classifier: fasttext/cpp/fasttext test/classifier.bin \ test/classifier_test_result.txt \ - test/classifier_pred_result.txt + test/classifier_pred_result.txt \ + test/classifier_pred_k_result.txt python test/classifier_test.py --verbose diff --git a/test/classifier_test.py b/test/classifier_test.py index 08e6186..b8cadb9 100644 --- a/test/classifier_test.py +++ b/test/classifier_test.py @@ -16,9 +16,11 @@ output = path.join(current_dir, 'generated_classifier') test_result = path.join(current_dir, 'classifier_test_result.txt') pred_result = path.join(current_dir, 'classifier_pred_result.txt') +pred_k_result = path.join(current_dir, 'classifier_pred_k_result.txt') test_file = path.join(current_dir, 'classifier_test.txt') -def read_labels(filename, label_prefix, unique=True): +# To validate model are loaded correctly +def read_labels_from_input(filename, label_prefix): labels = [] with open(filename, 'r') as f: for line in f: @@ -36,15 +38,42 @@ def 
read_labels(filename, label_prefix, unique=True): label = line.split(',', 1)[0].strip() label = label.replace(label_prefix, '') - if unique: - if label in labels: - continue - else: - labels.append(label) + if label in labels: + continue else: labels.append(label) return labels +# To validate model have the same prediction as fasttext(1) +def read_labels_from_result(filename, label_prefix): + all_labels = [] + with open(filename, 'r') as f: + for line in f: + try: + line = line.decode('utf-8') + except: + line = line + + labels = [] + raw_labels = line.split(' ') + for raw_label in raw_labels: + label = raw_label.replace(label_prefix, '') + labels.append(label.strip()) + all_labels.append(labels) + return all_labels + +# To read text data to predict +def read_texts(pred_file): + texts = [] + with open(pred_file, 'r') as f: + for line in f: + try: + line = line.decode('utf-8') + except: + line = line + texts.append(line) + return texts + # Test to make sure that classifier interface run correctly class TestClassifierModel(unittest.TestCase): def test_load_classifier_model(self): @@ -63,7 +92,7 @@ def test_load_classifier_model(self): self.assertEqual(model.bucket, 2000000) # Read labels from the the input_file - labels = read_labels(input_file, label_prefix) + labels = read_labels_from_input(input_file, label_prefix) # Make sure labels are loaded correctly self.assertTrue(sorted(model.labels) == sorted(labels)) @@ -93,7 +122,10 @@ def test_train_classifier(self): self.assertEqual(model.bucket, bucket) # Read labels from the the input_file - labels = read_labels(input_file, label_prefix) + labels = read_labels_from_input(input_file, label_prefix) + + # Make sure labels are loaded correctly + self.assertTrue(sorted(model.labels) == sorted(labels)) # Make sure .bin and .vec are generated self.assertTrue(path.isfile(output + '.bin')) @@ -102,45 +134,62 @@ def test_train_classifier(self): def test_classifier_test(self): # Read the test result from fasttext(1) using the 
same classifier model precision_at_one = 0.0 - num_examples = 0 + nexamples = 0 with open(test_result) as f: lines = f.readlines() precision_at_one = float(lines[0][5:].strip()) - num_examples = int(lines[1][20:].strip()) + recall_at_one = float(lines[1][5:].strip()) + nexamples = int(lines[2][20:].strip()) # Load and test using the same model and test set classifier = ft.load_model(classifier_bin, label_prefix='__label__') - p_at_1, num_ex = classifier.test(test_file) + result = classifier.test(test_file, k=1) # Make sure that the test result is the same as the result generated # by fasttext(1) - p_at_1 = float("{0:.2f}".format(p_at_1)) + p_at_1 = float("{0:.2f}".format(result.precision)) + r_at_1 = float("{0:.2f}".format(result.recall)) self.assertEqual(p_at_1, precision_at_one) - self.assertEqual(num_ex, num_examples) + self.assertEqual(r_at_1, recall_at_one) + self.assertEqual(result.nexamples, nexamples) def test_classifier_predict(self): - label_prefix = '__label__' # Load the pre-trained classifier + label_prefix = '__label__' classifier = ft.load_model(classifier_bin, label_prefix=label_prefix) - # Read texts from the pred_file, prediction made by fasttext(1) - texts = [] - with open(pred_file, 'r') as f: - for line in f: - try: - line = line.decode('utf-8') - except: - line = line - texts.append(line) + # Read prediction result from fasttext(1) + fasttext_labels = read_labels_from_result(pred_result, + label_prefix=label_prefix) + + # Read texts from the pred_file + texts = read_texts(pred_file) # Predict the labels - fasttext_labels = read_labels(pred_result, label_prefix=label_prefix, - unique=False) labels = classifier.predict(texts) # Make sure the returned labels are the same as predicted by # fasttext(1) self.assertTrue(labels == fasttext_labels) + def test_classifier_predict_k_best(self): + label_prefix = '__label__' + # Load the pre-trained classifier + classifier = ft.load_model(classifier_bin, label_prefix=label_prefix) + + # Read prediction 
result from fasttext(1) + fasttext_labels = read_labels_from_result(pred_k_result, + label_prefix=label_prefix) + + # Read texts from the pred_file + texts = read_texts(pred_file) + + # Predict the k-best labels + labels = classifier.predict(texts, k=5) + + # Make sure the returned labels are the same as predicted by + # fasttext(1) + self.assertTrue(labels == fasttext_labels) + if __name__ == '__main__': unittest.main() From c1f65de5275ea64eeb1b4865867a3f40f1160209 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Mon, 22 Aug 2016 16:22:48 +0700 Subject: [PATCH 052/109] Ignore prediction result from k-best label --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 54f7624..3416767 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,8 @@ facebookresearch-fasttext-* test/classifier_pred_result.txt test/classifier_test_result.txt test/dbpedia.train +test/classifier_pred_k_result.txt +test/dbpedia_csv/ # Misc TODO From 473bac179faddba879e4f685148a0bd449d1ab28 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Mon, 22 Aug 2016 16:23:48 +0700 Subject: [PATCH 053/109] Remove get_vector method from WordVectorModel class --- fasttext/model.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/fasttext/model.py b/fasttext/model.py index b2ea8ea..ba8e8c5 100644 --- a/fasttext/model.py +++ b/fasttext/model.py @@ -21,9 +21,6 @@ def __init__(self, model, words): self.lr_update_rate = model.lrUpdateRate; self.t = model.t; - def get_vector(self, word): - return self._model.get_vector(word) - def __getitem__(self, word): return self._model.get_vector(word) From fa47782a2845d5f2c5667c01043e1f2daaea6e66 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Mon, 22 Aug 2016 16:35:56 +0700 Subject: [PATCH 054/109] Remove get_vector from Skipgram & CBOW test --- test/cbow_test.py | 2 -- test/skipgram_test.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/test/cbow_test.py b/test/cbow_test.py index 0ae9dd5..d989e09 100644 --- 
a/test/cbow_test.py +++ b/test/cbow_test.py @@ -85,14 +85,12 @@ def test_train_cbow_model(self): # Make sure the vector have the right dimension self.assertEqual(len(model['the']), dim) - self.assertEqual(len(model.get_vector('the')), dim) # Make sure we support unicode character unicode_str = 'Καλημέρα' self.assertTrue(unicode_str in model.words) self.assertTrue(unicode_str in model) self.assertEqual(len(model[unicode_str]), model.dim) - self.assertEqual(len(model.get_vector(unicode_str)), model.dim) if __name__ == '__main__': unittest.main() diff --git a/test/skipgram_test.py b/test/skipgram_test.py index 52feb6d..1459e94 100644 --- a/test/skipgram_test.py +++ b/test/skipgram_test.py @@ -85,14 +85,12 @@ def test_train_skipgram_model(self): # Make sure the vector have the right dimension self.assertEqual(len(model['the']), dim) - self.assertEqual(len(model.get_vector('the')), dim) # Make sure we support unicode character unicode_str = 'Καλημέρα' self.assertTrue(unicode_str in model.words) self.assertTrue(unicode_str in model) self.assertEqual(len(model[unicode_str]), model.dim) - self.assertEqual(len(model.get_vector(unicode_str)), model.dim) if __name__ == '__main__': unittest.main() From 56adcdc0b22ae8e973bd9c6ceca1852238554e15 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Mon, 22 Aug 2016 16:38:46 +0700 Subject: [PATCH 055/109] Update README.md for v0.7 --- README.md | 69 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 44 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index d8ab67a..0ab7b29 100644 --- a/README.md +++ b/README.md @@ -71,8 +71,6 @@ The previously trained model can be used to compute word vectors for out-of-vocabulary words. ```python -print model.get_vector('king') -# or just use a nice syntax print model['king'] # get the vector of the word 'king' ``` @@ -132,12 +130,13 @@ equivalent as `fasttext(1)` command: This will output two files: `model.bin` and `model.vec`. 
Once the model was trained, we can evaluate it by computing the precision -at 1 (P@1) on a test set using `classifier.test` function: +at 1 (P@1) and the recall on a test set using `classifier.test` function: ```python -precision_at_one, nexamples = classifier.test('test.txt') -print 'P@1:', precision_at_one -print 'Number of examples:', nexamples +result = classifier.test('test.txt') +print 'P@1:', result.precision +print 'R@1:', result.recall +print 'Number of examples:', result.nexamples ``` This will print the same output to stdout as: @@ -145,6 +144,7 @@ This will print the same output to stdout as: ```shell ./fasttext test model.bin test.txt ``` + In order to obtain the most likely label for a list of text, we can use `classifer.predict` method: @@ -154,6 +154,13 @@ labels = classifier.predict(texts) print labels ``` +We can specify `k` value to get the k-best labels from classifier: + +```python +labels = classifier.predict(texts, k=3) +print labels +``` + This interface is equivalent as `fasttext(1)` predict command. The same model with the same input set will have the same prediction. @@ -203,7 +210,6 @@ model.minn # Min length of char ngram model.maxn # Max length of char ngram model.lr_update_rate # Rate of updates for the learning rate model.t # Value of sampling threshold -model.get_vector(word) # Get the vector of specified word model[word] # Get the vector of specified word ``` @@ -236,9 +242,16 @@ model and test set will produce the same value for the precision at one and the number of examples. ```python -precision_at_one, nexamples = classifier.test(test_file) +result = classifier.test(params) + +# Properties +result.precision # Precision at one +result.recall # Recall at one +result.nexamples # Number of test examples ``` +The param `k` is optional, and equal to `1` by default. + ### Predict the most-likely label of texts This interface is equivalent as `fasttext(1)` predict command. 
@@ -246,33 +259,39 @@ This interface is equivalent as `fasttext(1)` predict command. `texts` is an array of string ```python -labels = classifier.predict(texts) +labels = classifier.predict(texts, k) ``` +The param `k` is optional, and equal to `1` by default. + ### Attributes and methods for the classifier Classifier have the following atributes & methods ```python -classifier.labels # List of labels -classifier.label_prefix # Prefix of the label -classifier.dim # Size of word vector -classifier.ws # Size of context window -classifier.epoch # Number of epochs -classifier.min_count # Minimal number of word occurences -classifier.neg # Number of negative sampled -classifier.word_ngrams # Max length of word ngram -classifier.loss_name # Loss function name -classifier.bucket # Number of buckets -classifier.minn # Min length of char ngram -classifier.maxn # Max length of char ngram -classifier.lr_update_rate # Rate of updates for the learning rate -classifier.t # Value of sampling threshold -classifier.test(filename) # Test the classifier -classifier.predict(texts) # Predict the most likely label +classifier.labels # List of labels +classifier.label_prefix # Prefix of the label +classifier.dim # Size of word vector +classifier.ws # Size of context window +classifier.epoch # Number of epochs +classifier.min_count # Minimal number of word occurences +classifier.neg # Number of negative sampled +classifier.word_ngrams # Max length of word ngram +classifier.loss_name # Loss function name +classifier.bucket # Number of buckets +classifier.minn # Min length of char ngram +classifier.maxn # Max length of char ngram +classifier.lr_update_rate # Rate of updates for the learning rate +classifier.t # Value of sampling threshold +classifier.test(filename, k) # Test the classifier +classifier.predict(texts, k) # Predict the most likely label + ``` +The param `k` for `classifier.test` and `classifier.predict` is optional, +and equal to `1` by default. 
+ ### Params List of available `params` and their default value: From a4714e54cf0ddf3ef536aba1faede0f0793e89d0 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Mon, 22 Aug 2016 16:39:06 +0700 Subject: [PATCH 056/109] Update v0.6.4 to v0.7 --- fasttext/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fasttext/VERSION b/fasttext/VERSION index d2b13eb..faef31a 100644 --- a/fasttext/VERSION +++ b/fasttext/VERSION @@ -1 +1 @@ -0.6.4 +0.7.0 From da8c2659b0ca1d29eebf86523a9a4b93767dfa11 Mon Sep 17 00:00:00 2001 From: Renaud Richardet Date: Mon, 22 Aug 2016 12:17:11 +0200 Subject: [PATCH 057/109] typo --- examples/classification_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/classification_example.py b/examples/classification_example.py index d699436..58234f3 100644 --- a/examples/classification_example.py +++ b/examples/classification_example.py @@ -1,4 +1,4 @@ -import fasttext +import fasttext as ft # Download the dbpedia.train first on test/ # and move to the example directory From 123dcb1ea7eedbbcbb10acca4412c1cba00a0694 Mon Sep 17 00:00:00 2001 From: Renaud Richardet Date: Mon, 22 Aug 2016 14:36:29 +0200 Subject: [PATCH 058/109] info about where to obtain the training data --- examples/classification_example.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/classification_example.py b/examples/classification_example.py index 58234f3..3eb10eb 100644 --- a/examples/classification_example.py +++ b/examples/classification_example.py @@ -1,7 +1,7 @@ import fasttext as ft -# Download the dbpedia.train first on test/ -# and move to the example directory +# Fist download the dbpedia.train using https://github.com/facebookresearch/fastText/blob/master/classification-example.sh +# on test/ and move to the example directory current_dir = path.dirname(__file__) input_file = path.join(current_dir, 'dbpedia.train') output = '/tmp/classifier' From 
3916462e275310ae68e74d798ad49999e08062c3 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Mon, 22 Aug 2016 23:11:08 +0700 Subject: [PATCH 059/109] fastText: Update 1826a12 to 3223526 --- fasttext/cpp/LAST_COMMIT | 2 +- fasttext/cpp/src/dictionary.cc | 17 ++++++++++------- fasttext/cpp/src/dictionary.h | 2 +- fasttext/cpp/src/fasttext.cc | 4 ++-- update-fasttext.sh | 2 +- 5 files changed, 15 insertions(+), 12 deletions(-) diff --git a/fasttext/cpp/LAST_COMMIT b/fasttext/cpp/LAST_COMMIT index bed3022..6a458e6 100644 --- a/fasttext/cpp/LAST_COMMIT +++ b/fasttext/cpp/LAST_COMMIT @@ -1 +1 @@ -1826a12 +3223526 diff --git a/fasttext/cpp/src/dictionary.cc b/fasttext/cpp/src/dictionary.cc index 266bc27..19a80b7 100644 --- a/fasttext/cpp/src/dictionary.cc +++ b/fasttext/cpp/src/dictionary.cc @@ -147,30 +147,33 @@ void Dictionary::initNgrams() { } } -std::string Dictionary::readWord(std::ifstream& fin) +bool Dictionary::readWord(std::ifstream& fin, std::string& word) { char c; - std::string word; + word.clear(); while (fin.peek() != EOF) { fin.get(c); if (isspace(c) || c == 0) { if (word.empty()) { - if (c == '\n') return EOS; + if (c == '\n') { + word += EOS; + return true; + } continue; } else { if (c == '\n') fin.unget(); - return word; + return true; } } word.push_back(c); } - return word; + return !word.empty(); } void Dictionary::readFromFile(std::ifstream& ifs) { std::string word; int64_t minThreshold = 1; - while (!(word = readWord(ifs)).empty()) { + while (readWord(ifs, word)) { add(word); if (ntokens_ % 1000000 == 0) { std::cout << "\rRead " << ntokens_ / 1000000 << "M words" << std::flush; @@ -248,7 +251,7 @@ int32_t Dictionary::getLine(std::ifstream& ifs, ifs.clear(); ifs.seekg(std::streampos(0)); } - while (!(token = readWord(ifs)).empty()) { + while (readWord(ifs, token)) { if (token == EOS) break; int32_t wid = getId(token); if (wid < 0) continue; diff --git a/fasttext/cpp/src/dictionary.h b/fasttext/cpp/src/dictionary.h index 9608ac0..7d2fe26 100644 --- 
a/fasttext/cpp/src/dictionary.h +++ b/fasttext/cpp/src/dictionary.h @@ -63,7 +63,7 @@ class Dictionary { void computeNgrams(const std::string&, std::vector&); uint32_t hash(const std::string& str); void add(const std::string&); - std::string readWord(std::ifstream&); + bool readWord(std::ifstream&, std::string&); void readFromFile(std::ifstream&); std::string getLabel(int32_t); void save(std::ofstream&); diff --git a/fasttext/cpp/src/fasttext.cc b/fasttext/cpp/src/fasttext.cc index 38397a9..7a7cc47 100644 --- a/fasttext/cpp/src/fasttext.cc +++ b/fasttext/cpp/src/fasttext.cc @@ -393,8 +393,8 @@ void train(int argc, char** argv) { double trainTime = difftime(time(nullptr), t0); std::cout << "Train time: " << trainTime << " sec" << std::endl; - if (args.output.size() != 0) { - saveModel(dict, input, output); + saveModel(dict, input, output); + if (args.model != model_name::sup) { saveVectors(dict, input, output); } } diff --git a/update-fasttext.sh b/update-fasttext.sh index ffbb148..b502385 100644 --- a/update-fasttext.sh +++ b/update-fasttext.sh @@ -1,4 +1,4 @@ -NEW_VERSION=1826a12 +NEW_VERSION=3223526 CURRENT_VERSION=$(cat fasttext/cpp/LAST_COMMIT) if [ "$NEW_VERSION" = "$CURRENT_VERSION" ]; then From 4c1f783430c9983dca4d77ca80b2c4ca8b25904e Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Mon, 22 Aug 2016 23:17:31 +0700 Subject: [PATCH 060/109] Update README.rst --- README.rst | 68 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 25 deletions(-) diff --git a/README.rst b/README.rst index 8184104..1129921 100644 --- a/README.rst +++ b/README.rst @@ -79,8 +79,6 @@ out-of-vocabulary words. .. code:: python - print model.get_vector('king') - # or just use a nice syntax print model['king'] # get the vector of the word 'king' the following ``fasttext(1)`` command is equivalent: @@ -141,13 +139,15 @@ equivalent as ``fasttext(1)`` command: This will output two files: ``model.bin`` and ``model.vec``. 
Once the model was trained, we can evaluate it by computing the -precision at 1 (P@1) on a test set using ``classifier.test`` function: +precision at 1 (P@1) and the recall on a test set using +``classifier.test`` function: .. code:: python - precision_at_one, nexamples = classifier.test('test.txt') - print 'P@1:', precision_at_one - print 'Number of examples:', nexamples + result = classifier.test('test.txt') + print 'P@1:', result.precision + print 'R@1:', result.recall + print 'Number of examples:', result.nexamples This will print the same output to stdout as: @@ -164,6 +164,13 @@ In order to obtain the most likely label for a list of text, we can use labels = classifier.predict(texts) print labels +We can specify ``k`` value to get the k-best labels from classifier: + +.. code:: python + + labels = classifier.predict(texts, k=3) + print labels + This interface is equivalent as ``fasttext(1)`` predict command. The same model with the same input set will have the same prediction. @@ -219,7 +226,6 @@ Skipgram and CBOW model have the following atributes & methods model.maxn # Max length of char ngram model.lr_update_rate # Rate of updates for the learning rate model.t # Value of sampling threshold - model.get_vector(word) # Get the vector of specified word model[word] # Get the vector of specified word Supervised model @@ -254,7 +260,14 @@ one and the number of examples. .. code:: python - precision_at_one, nexamples = classifier.test(test_file) + result = classifier.test(params) + + # Properties + result.precision # Precision at one + result.recall # Recall at one + result.nexamples # Number of test examples + +The param ``k`` is optional, and equal to ``1`` by default. Predict the most-likely label of texts ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -265,7 +278,9 @@ This interface is equivalent as ``fasttext(1)`` predict command. .. 
code:: python - labels = classifier.predict(texts) + labels = classifier.predict(texts, k) + +The param ``k`` is optional, and equal to ``1`` by default. Attributes and methods for the classifier ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -274,22 +289,25 @@ Classifier have the following atributes & methods .. code:: python - classifier.labels # List of labels - classifier.label_prefix # Prefix of the label - classifier.dim # Size of word vector - classifier.ws # Size of context window - classifier.epoch # Number of epochs - classifier.min_count # Minimal number of word occurences - classifier.neg # Number of negative sampled - classifier.word_ngrams # Max length of word ngram - classifier.loss_name # Loss function name - classifier.bucket # Number of buckets - classifier.minn # Min length of char ngram - classifier.maxn # Max length of char ngram - classifier.lr_update_rate # Rate of updates for the learning rate - classifier.t # Value of sampling threshold - classifier.test(filename) # Test the classifier - classifier.predict(texts) # Predict the most likely label + classifier.labels # List of labels + classifier.label_prefix # Prefix of the label + classifier.dim # Size of word vector + classifier.ws # Size of context window + classifier.epoch # Number of epochs + classifier.min_count # Minimal number of word occurences + classifier.neg # Number of negative sampled + classifier.word_ngrams # Max length of word ngram + classifier.loss_name # Loss function name + classifier.bucket # Number of buckets + classifier.minn # Min length of char ngram + classifier.maxn # Max length of char ngram + classifier.lr_update_rate # Rate of updates for the learning rate + classifier.t # Value of sampling threshold + classifier.test(filename, k) # Test the classifier + classifier.predict(texts, k) # Predict the most likely label + +The param ``k`` for ``classifier.test`` and ``classifier.predict`` is +optional, and equal to ``1`` by default. 
Params ~~~~~~ From 7ecbbf9eed792e5edc4231781f1aeb8c7e681983 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Mon, 22 Aug 2016 23:55:11 +0700 Subject: [PATCH 061/109] Add pre-test target to make sure test env are clean before running test --- Makefile | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index a1dd9bf..6c286a0 100644 --- a/Makefile +++ b/Makefile @@ -24,6 +24,11 @@ install-dev: README.rst python setup.py develop .PHONY: install-dev +pre-test: + # Remove generated file from test + rm test/*.vec test/*.bin test/*_result.txt +.PHONY: pre-test + fasttext/cpp/fasttext: make --directory fasttext/cpp/ @@ -36,7 +41,7 @@ test/skipgram_params_test.bin: -minCount 1 -neg 5 -loss ns -bucket 2000000 -minn 3 -maxn 6 \ -thread 4 -lrUpdateRate 100 -t 1e-4 >> /dev/null -test-skipgram: fasttext/cpp/fasttext test/skipgram_params_test.bin +test-skipgram: pre-test fasttext/cpp/fasttext test/skipgram_params_test.bin python test/skipgram_test.py --verbose # Test for cbow model @@ -48,7 +53,7 @@ test/cbow_params_test.bin: -minCount 1 -neg 5 -loss ns -bucket 2000000 -minn 3 -maxn 6 \ -thread 4 -lrUpdateRate 100 -t 1e-4 >> /dev/null -test-cbow: fasttext/cpp/fasttext test/cbow_params_test.bin +test-cbow: pre-test fasttext/cpp/fasttext test/cbow_params_test.bin python test/cbow_test.py --verbose # Test for classifier @@ -76,7 +81,7 @@ test/classifier_pred_k_result.txt: test/classifier.bin test/classifier_pred_test.txt 5 > \ test/classifier_pred_k_result.txt -test-classifier: fasttext/cpp/fasttext test/classifier.bin \ +test-classifier: pre-test fasttext/cpp/fasttext test/classifier.bin \ test/classifier_test_result.txt \ test/classifier_pred_result.txt \ test/classifier_pred_k_result.txt From 7ee1a6058978bd30df9a1c359af89f6949084cd5 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Mon, 22 Aug 2016 23:55:45 +0700 Subject: [PATCH 062/109] Update classifier test, model.vec is not generated anymore --- test/classifier_test.py 
| 1 - 1 file changed, 1 deletion(-) diff --git a/test/classifier_test.py b/test/classifier_test.py index b8cadb9..0ae9ffd 100644 --- a/test/classifier_test.py +++ b/test/classifier_test.py @@ -129,7 +129,6 @@ def test_train_classifier(self): # Make sure .bin and .vec are generated self.assertTrue(path.isfile(output + '.bin')) - self.assertTrue(path.isfile(output + '.vec')) def test_classifier_test(self): # Read the test result from fasttext(1) using the same classifier model From abab1110aea1c594ab3b18dfbfc3ab2db4b77b10 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Tue, 23 Aug 2016 00:03:50 +0700 Subject: [PATCH 063/109] Update example for the classifier model --- examples/classification_example.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/classification_example.py b/examples/classification_example.py index 3eb10eb..7dd09ec 100644 --- a/examples/classification_example.py +++ b/examples/classification_example.py @@ -24,9 +24,10 @@ thread=thread, silent=silent, label_prefix=label_prefix) # Test the classifier -p_at_1, num_ex = classifier.test(test_file) -print 'P@1:', p_at_1 -print 'Number of examples:', num_ex +result = classifier.test(test_file) +print 'P@1:', result.precision +print 'R@1:', result.recall +print 'Number of examples:', result.nexamples # Predict some text # (Example text is from dbpedia.train) From 9a70502d24ff0cf610def66b22948abe64547c39 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Tue, 23 Aug 2016 00:07:05 +0700 Subject: [PATCH 064/109] Update pre-test target, rm files if exists --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 6c286a0..313a4f0 100644 --- a/Makefile +++ b/Makefile @@ -26,7 +26,7 @@ install-dev: README.rst pre-test: # Remove generated file from test - rm test/*.vec test/*.bin test/*_result.txt + rm -f test/*.vec test/*.bin test/*_result.txt .PHONY: pre-test fasttext/cpp/fasttext: From 
31f9639ce8d933633bd7719705305d26606340f7 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Tue, 23 Aug 2016 00:38:01 +0700 Subject: [PATCH 065/109] Update v0.7.0 to v0.7.1 [skip ci] --- fasttext/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fasttext/VERSION b/fasttext/VERSION index faef31a..39e898a 100644 --- a/fasttext/VERSION +++ b/fasttext/VERSION @@ -1 +1 @@ -0.7.0 +0.7.1 From 8eca177d6ce706bf7b0ac32bd7d81177a0fdc53c Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Thu, 25 Aug 2016 03:38:04 +0700 Subject: [PATCH 066/109] Update the default params --- fasttext/__init__.py | 1 + fasttext/fasttext.pyx | 59 +++++++++++++++++++++++++++++++++++-------- 2 files changed, 50 insertions(+), 10 deletions(-) diff --git a/fasttext/__init__.py b/fasttext/__init__.py index 6af7367..f384a42 100644 --- a/fasttext/__init__.py +++ b/fasttext/__init__.py @@ -2,6 +2,7 @@ from .fasttext import cbow from .fasttext import load_model from .fasttext import supervised +from .fasttext import default_args @property def __VERSION__(): diff --git a/fasttext/fasttext.pyx b/fasttext/fasttext.pyx index 90d3737..0960296 100644 --- a/fasttext/fasttext.pyx +++ b/fasttext/fasttext.pyx @@ -202,29 +202,68 @@ def train_wrapper(model_name, input_file, output, label_prefix, lr, dim, ws, return model +# Default value from fasttext(1) +LR=0.05 +DIM=100 +WS=5 +EPOCH=5 +MINCOUNT=5 +NEG=5 +WORDNGRAMS=1 +LOSS='ns' +BUCKET=2000000 +MINN=3 +MAXN=6 +THREAD=12 +LRUPDATERATE=100 +T=1e-4 +SILENT=1 +LABELPREFIX='__label__' + +# Set as dictionary for easier to lookup & test +default_args = { + 'lr': LR, + 'dim': DIM, + 'ws': WS, + 'epoch': EPOCH, + 'minCount': MINCOUNT, + 'neg': NEG, + 'wordNgrams': WORDNGRAMS, + 'loss': LOSS, + 'bucket': BUCKET, + 'minn': MINN, + 'maxn': MAXN, + 'thread': THREAD, + 'lrUpdateRate': LRUPDATERATE, + 't': T, + 'label': LABELPREFIX +} + # Learn word representation using skipgram model -def skipgram(input_file, output, lr=0.05, dim=100, ws=5, 
epoch=5, min_count=5, - neg=5, word_ngrams=1, loss='ns', bucket=2000000, minn=3, maxn=6, - thread=12, lr_update_rate=100, t=1e-4, silent=1): +def skipgram(input_file, output, lr=LR, dim=DIM, ws=WS, epoch=EPOCH, + min_count=MINCOUNT, neg=NEG, word_ngrams=WORDNGRAMS, loss=LOSS, + bucket=BUCKET, minn=MINN, maxn=MAXN, thread=THREAD, + lr_update_rate=LRUPDATERATE, t=T, silent=SILENT): label_prefix = '' return train_wrapper('skipgram', input_file, output, label_prefix, lr, dim, ws, epoch, min_count, neg, word_ngrams, loss, bucket, minn, maxn, thread, lr_update_rate, t, silent) # Learn word representation using CBOW model -def cbow(input_file, output, lr=0.05, dim=100, ws=5, epoch=5, min_count=5, - neg=5, word_ngrams=1, loss='ns', bucket=2000000, minn=3, maxn=6, - thread=12, lr_update_rate=100, t=1e-4, silent=1): +def cbow(input_file, output, lr=LR, dim=DIM, ws=WS, epoch=EPOCH, + min_count=MINCOUNT, neg=NEG, word_ngrams=WORDNGRAMS, loss=LOSS, + bucket=BUCKET, minn=MINN, maxn=MAXN, thread=THREAD, + lr_update_rate=LRUPDATERATE, t=T, silent=SILENT): label_prefix = '' return train_wrapper('cbow', input_file, output, label_prefix, lr, dim, ws, epoch, min_count, neg, word_ngrams, loss, bucket, minn, maxn, thread, lr_update_rate, t, silent) # Train classifier -def supervised(input_file, output, label_prefix='__label__', lr=0.05, dim=100, - ws=5, epoch=5, min_count=5, neg=5, word_ngrams=1, loss='ns', - bucket=2000000, minn=3, maxn=6, thread=12, lr_update_rate=100, - t=1e-4, silent=1): +def supervised(input_file, output, label_prefix=LABELPREFIX, lr=LR, dim=DIM, + ws=WS, epoch=EPOCH, min_count=MINCOUNT, neg=NEG, word_ngrams=WORDNGRAMS, + loss=LOSS, bucket=BUCKET, minn=MINN, maxn=MAXN, thread=THREAD, + lr_update_rate=LRUPDATERATE, t=T, silent=SILENT): return train_wrapper('supervised', input_file, output, label_prefix, lr, dim, ws, epoch, min_count, neg, word_ngrams, loss, bucket, minn, maxn, thread, lr_update_rate, t, silent) From 52e359c47711513f9bed8fc90eb7c4f8460f5bda Mon Sep 
17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Thu, 25 Aug 2016 03:38:35 +0700 Subject: [PATCH 067/109] Add default params test --- Makefile | 6 ++- test/Makefile | 18 +++++++++ test/default_params_test.cc | 34 ++++++++++++++++ test/default_params_test.py | 78 +++++++++++++++++++++++++++++++++++++ 4 files changed, 135 insertions(+), 1 deletion(-) create mode 100644 test/Makefile create mode 100644 test/default_params_test.cc create mode 100644 test/default_params_test.py diff --git a/Makefile b/Makefile index 313a4f0..467092a 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ all: install test -test: test-skipgram test-cbow test-classifier +test: test-skipgram test-cbow test-classifier test-default-params buildext: python setup.py build_ext --inplace @@ -87,3 +87,7 @@ test-classifier: pre-test fasttext/cpp/fasttext test/classifier.bin \ test/classifier_pred_k_result.txt python test/classifier_test.py --verbose +# Default params test +test-default-params: + $(MAKE) --directory test + python test/default_params_test.py --verbose diff --git a/test/Makefile b/test/Makefile new file mode 100644 index 0000000..267b5f5 --- /dev/null +++ b/test/Makefile @@ -0,0 +1,18 @@ +# Makefile for building test written in C++ +CXX = c++ +CXXFLAGS = -pthread -std=c++0x + +default: default_params_test default_params_result.txt + +args.o: ../fasttext/cpp/args.o +.PHONY: args.o + +../fasttext/cpp/args.o: + $(MAKE) --directory ../fasttext/cpp/ + +default_params_test: args.o default_params_test.cc + $(CXX) $(CXXFLAGS) ../fasttext/cpp/args.o \ + default_params_test.cc -o default_params_test + +default_params_result.txt: default_params_test + ./default_params_test > default_params_result.txt diff --git a/test/default_params_test.cc b/test/default_params_test.cc new file mode 100644 index 0000000..aeac0d4 --- /dev/null +++ b/test/default_params_test.cc @@ -0,0 +1,34 @@ +/* To print the value of default params from fasttext(1) */ +#include + +#include "../fasttext/cpp/src/args.h" + +int 
main(int argc, char **argv) +{ + Args args; + std::cout << "lr " << args.lr << std::endl; + std::cout << "dim " << args.dim << std::endl; + std::cout << "ws " << args.ws << std::endl; + std::cout << "epoch " << args.epoch << std::endl; + std::cout << "minCount " << args.minCount << std::endl; + std::cout << "neg " << args.neg << std::endl; + std::cout << "wordNgrams " << args.wordNgrams << std::endl; + std::string lossName; + if(args.loss == loss_name::ns) { + lossName = "ns"; + } + if(args.loss == loss_name::hs) { + lossName = "hs"; + } + if(args.loss == loss_name::softmax) { + lossName = "softmax"; + } + std::cout << "loss " << lossName << std::endl; + std::cout << "bucket " << args.bucket << std::endl; + std::cout << "minn " << args.minn << std::endl; + std::cout << "maxn " << args.maxn << std::endl; + std::cout << "thread " << args.thread << std::endl; + std::cout << "lrUpdateRate " << args.lrUpdateRate << std::endl; + std::cout << "t " << args.t << std::endl; + std::cout << "label " << args.label << std::endl; +} diff --git a/test/default_params_test.py b/test/default_params_test.py new file mode 100644 index 0000000..17c2f81 --- /dev/null +++ b/test/default_params_test.py @@ -0,0 +1,78 @@ +# Set encoding to support Python 2 +# -*- coding: utf-8 -*- + +import unittest +from os import path + +import fasttext as ft + +current_dir = path.dirname(__file__) +params_txt = path.join(current_dir, 'default_params_result.txt') + +# Test to make sure that default params is equivalent as fastetxt(1) +class TestDefaultParams(unittest.TestCase): + def test_default_params(self): + default_args = {} + with open(params_txt, 'r') as f: + for line in f: + try: + line = line.decode('utf-8') + except: + line = line + + raw = line.split(' ') + key = raw[0] + value = raw[1].strip() + default_args[key] = value + + # Make sure the default value of learning rate is correct + self.assertEqual(ft.default_args['lr'], float(default_args['lr'])) + + # Make sure the default value of the 
dimension is correct + self.assertEqual(ft.default_args['dim'], int(default_args['dim'])) + + # Make sure the default value of ws is correct + self.assertEqual(ft.default_args['ws'], int(default_args['ws'])) + + # Make sure the default value of epoch is correct + self.assertEqual(ft.default_args['epoch'], int(default_args['epoch'])) + + # Make sure the default value of minCount is correct + self.assertEqual(ft.default_args['minCount'], + int(default_args['minCount'])) + + # Make sure the default value of neg is correct + self.assertEqual(ft.default_args['neg'], int(default_args['neg'])) + + # Make sure the default value of wordNgrams is correct + self.assertEqual(ft.default_args['wordNgrams'], + int(default_args['wordNgrams'])) + + # Make sure the default value of loss is correct + self.assertEqual(ft.default_args['loss'], default_args['loss']) + + # Make sure the default value of bucket is correct + self.assertEqual(ft.default_args['bucket'], + int(default_args['bucket'])) + + # Make sure the default value of minn is correct + self.assertEqual(ft.default_args['minn'], int(default_args['minn'])) + + # Make sure the default value of maxn is correct + self.assertEqual(ft.default_args['maxn'], int(default_args['maxn'])) + + # Make sure the default value of thread is correct + self.assertEqual(ft.default_args['thread'], int(default_args['thread'])) + + # Make sure the default value of lrUpdateRate is correct + self.assertEqual(ft.default_args['lrUpdateRate'], + float(default_args['lrUpdateRate'])) + + # Make sure the default value of t is correct + self.assertEqual(ft.default_args['t'], float(default_args['t'])) + + # Make sure the default value of label is correct + self.assertEqual(ft.default_args['label'], default_args['label']) + +if __name__ == '__main__': + unittest.main() From b83a7e5cf64244346f43c8fbc674490745ec292b Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Thu, 25 Aug 2016 03:38:51 +0700 Subject: [PATCH 068/109] Ignore files from default params 
test --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 3416767..8df159a 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,8 @@ test/classifier_test_result.txt test/dbpedia.train test/classifier_pred_k_result.txt test/dbpedia_csv/ +test/default_params_result.txt +test/default_params_test # Misc TODO From e96b801349f0c3d7bc4dffd283b12fa8e97f4f0b Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Thu, 25 Aug 2016 03:43:23 +0700 Subject: [PATCH 069/109] Update v0.7.1 to v0.7.2 --- fasttext/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fasttext/VERSION b/fasttext/VERSION index 39e898a..7486fdb 100644 --- a/fasttext/VERSION +++ b/fasttext/VERSION @@ -1 +1 @@ -0.7.1 +0.7.2 From fe5fcd38a52082b4d0cc1136196a765c64cbe2b3 Mon Sep 17 00:00:00 2001 From: Mengxuan Xia Date: Thu, 25 Aug 2016 12:11:53 -0400 Subject: [PATCH 070/109] Fix C++ code compilation problem on OSX (#55) * Fix C++ code compilation problem on OSX * Fix linux/windows build regression --- setup.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 62a50bb..d4a25a0 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,7 @@ from setuptools import setup, find_packages from setuptools.extension import Extension from Cython.Build import cythonize +from sys import platform import unittest # Read the fastText.py version @@ -9,6 +10,12 @@ def read_version(): return f.read().strip() # Define the C++ extension + +if platform == "darwin": + extra_compile_args = ['-O3', '-pthread', '-funroll-loops', '-std=c++0x', '-stdlib=libc++', '-mmacosx-version-min=10.7'] +else: + extra_compile_args = ['-O3', '-pthread', '-funroll-loops', '-std=c++0x'] + extensions = [ Extension('*', sources=[ @@ -22,7 +29,7 @@ def read_version(): 'fasttext/cpp/src/vector.cc' ], language='c++', - extra_compile_args=['-pthread', '-funroll-loops', '-std=c++0x']) + extra_compile_args=extra_compile_args) ] # Package details From 
860d4a79cb7d21f2d40c35bf11579e4ba4a3c3d0 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Mon, 29 Aug 2016 14:21:58 +0700 Subject: [PATCH 071/109] Update default params test --- Makefile | 27 +++++++++---- fasttext/__init__.py | 1 - fasttext/fasttext.pyx | 59 +++++----------------------- test/Makefile | 13 +++++++ test/cbow_test.py | 36 +++++++++++++++-- test/classifier_test.py | 48 ++++++++++++++++++----- test/default_params.py | 17 ++++++++ test/default_params_test.cc | 1 + test/default_params_test.py | 78 ------------------------------------- test/skipgram_test.py | 36 +++++++++++++++-- 10 files changed, 165 insertions(+), 151 deletions(-) create mode 100644 test/default_params.py delete mode 100644 test/default_params_test.py diff --git a/Makefile b/Makefile index 467092a..21803ba 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ all: install test -test: test-skipgram test-cbow test-classifier test-default-params +test: test-skipgram test-cbow test-classifier buildext: python setup.py build_ext --inplace @@ -41,7 +41,12 @@ test/skipgram_params_test.bin: -minCount 1 -neg 5 -loss ns -bucket 2000000 -minn 3 -maxn 6 \ -thread 4 -lrUpdateRate 100 -t 1e-4 >> /dev/null -test-skipgram: pre-test fasttext/cpp/fasttext test/skipgram_params_test.bin +# Generate default value of skipgram command from fasttext(1) +test/skipgram_default_params_result.txt: + $(MAKE) skipgram_default_params_result.txt --directory test/ + +test-skipgram: pre-test fasttext/cpp/fasttext test/skipgram_params_test.bin \ + test/skipgram_default_params_result.txt python test/skipgram_test.py --verbose # Test for cbow model @@ -53,7 +58,12 @@ test/cbow_params_test.bin: -minCount 1 -neg 5 -loss ns -bucket 2000000 -minn 3 -maxn 6 \ -thread 4 -lrUpdateRate 100 -t 1e-4 >> /dev/null -test-cbow: pre-test fasttext/cpp/fasttext test/cbow_params_test.bin +# Generate default value of cbow command from fasttext(1) +test/cbow_default_params_result.txt: + $(MAKE) cbow_default_params_result.txt --directory 
test/ + +test-cbow: pre-test fasttext/cpp/fasttext test/cbow_params_test.bin \ + test/cbow_default_params_result.txt python test/cbow_test.py --verbose # Test for classifier @@ -81,13 +91,14 @@ test/classifier_pred_k_result.txt: test/classifier.bin test/classifier_pred_test.txt 5 > \ test/classifier_pred_k_result.txt +# Generate default value of classifier command from fasttext(1) +test/classifier_default_params_result.txt: + $(MAKE) classifier_default_params_result.txt --directory test/ + test-classifier: pre-test fasttext/cpp/fasttext test/classifier.bin \ test/classifier_test_result.txt \ test/classifier_pred_result.txt \ - test/classifier_pred_k_result.txt + test/classifier_pred_k_result.txt \ + test/classifier_default_params_result.txt python test/classifier_test.py --verbose -# Default params test -test-default-params: - $(MAKE) --directory test - python test/default_params_test.py --verbose diff --git a/fasttext/__init__.py b/fasttext/__init__.py index f384a42..6af7367 100644 --- a/fasttext/__init__.py +++ b/fasttext/__init__.py @@ -2,7 +2,6 @@ from .fasttext import cbow from .fasttext import load_model from .fasttext import supervised -from .fasttext import default_args @property def __VERSION__(): diff --git a/fasttext/fasttext.pyx b/fasttext/fasttext.pyx index 0960296..f6bbdce 100644 --- a/fasttext/fasttext.pyx +++ b/fasttext/fasttext.pyx @@ -202,68 +202,29 @@ def train_wrapper(model_name, input_file, output, label_prefix, lr, dim, ws, return model -# Default value from fasttext(1) -LR=0.05 -DIM=100 -WS=5 -EPOCH=5 -MINCOUNT=5 -NEG=5 -WORDNGRAMS=1 -LOSS='ns' -BUCKET=2000000 -MINN=3 -MAXN=6 -THREAD=12 -LRUPDATERATE=100 -T=1e-4 -SILENT=1 -LABELPREFIX='__label__' - -# Set as dictionary for easier to lookup & test -default_args = { - 'lr': LR, - 'dim': DIM, - 'ws': WS, - 'epoch': EPOCH, - 'minCount': MINCOUNT, - 'neg': NEG, - 'wordNgrams': WORDNGRAMS, - 'loss': LOSS, - 'bucket': BUCKET, - 'minn': MINN, - 'maxn': MAXN, - 'thread': THREAD, - 'lrUpdateRate': 
LRUPDATERATE, - 't': T, - 'label': LABELPREFIX -} - # Learn word representation using skipgram model -def skipgram(input_file, output, lr=LR, dim=DIM, ws=WS, epoch=EPOCH, - min_count=MINCOUNT, neg=NEG, word_ngrams=WORDNGRAMS, loss=LOSS, - bucket=BUCKET, minn=MINN, maxn=MAXN, thread=THREAD, - lr_update_rate=LRUPDATERATE, t=T, silent=SILENT): +def skipgram(input_file, output, lr=0.05, dim=100, ws=5, epoch=5, min_count=5, + neg=5, word_ngrams=1, loss='ns', bucket=2000000, minn=3, maxn=6, + thread=12, lr_update_rate=100, t=1e-4, silent=1): label_prefix = '' return train_wrapper('skipgram', input_file, output, label_prefix, lr, dim, ws, epoch, min_count, neg, word_ngrams, loss, bucket, minn, maxn, thread, lr_update_rate, t, silent) # Learn word representation using CBOW model -def cbow(input_file, output, lr=LR, dim=DIM, ws=WS, epoch=EPOCH, - min_count=MINCOUNT, neg=NEG, word_ngrams=WORDNGRAMS, loss=LOSS, - bucket=BUCKET, minn=MINN, maxn=MAXN, thread=THREAD, - lr_update_rate=LRUPDATERATE, t=T, silent=SILENT): +def cbow(input_file, output, lr=0.05, dim=100, ws=5, epoch=5, min_count=5, + neg=5, word_ngrams=1, loss='ns', bucket=2000000, minn=3, maxn=6, + thread=12, lr_update_rate=100, t=1e-4, silent=1): label_prefix = '' return train_wrapper('cbow', input_file, output, label_prefix, lr, dim, ws, epoch, min_count, neg, word_ngrams, loss, bucket, minn, maxn, thread, lr_update_rate, t, silent) # Train classifier -def supervised(input_file, output, label_prefix=LABELPREFIX, lr=LR, dim=DIM, - ws=WS, epoch=EPOCH, min_count=MINCOUNT, neg=NEG, word_ngrams=WORDNGRAMS, - loss=LOSS, bucket=BUCKET, minn=MINN, maxn=MAXN, thread=THREAD, - lr_update_rate=LRUPDATERATE, t=T, silent=SILENT): +def supervised(input_file, output, label_prefix='__label__', lr=0.05, dim=100, + ws=5, epoch=5, min_count=1, neg=5, word_ngrams=1, loss='softmax', + bucket=2000000, minn=3, maxn=6, thread=12, lr_update_rate=100, + t=1e-4, silent=1): return train_wrapper('supervised', input_file, output, label_prefix, 
lr, dim, ws, epoch, min_count, neg, word_ngrams, loss, bucket, minn, maxn, thread, lr_update_rate, t, silent) diff --git a/test/Makefile b/test/Makefile index 267b5f5..b86ed11 100644 --- a/test/Makefile +++ b/test/Makefile @@ -16,3 +16,16 @@ default_params_test: args.o default_params_test.cc default_params_result.txt: default_params_test ./default_params_test > default_params_result.txt + +skipgram_default_params_result.txt: default_params_test + ./default_params_test skipgram -input test -output test \ + > skipgram_default_params_result.txt + +cbow_default_params_result.txt: default_params_test + ./default_params_test cbow -input test -output test \ + > cbow_default_params_result.txt + +classifier_default_params_result.txt: default_params_test + ./default_params_test supervised -input test -output test \ + > classifier_default_params_result.txt + diff --git a/test/cbow_test.py b/test/cbow_test.py index d989e09..de1d107 100644 --- a/test/cbow_test.py +++ b/test/cbow_test.py @@ -9,9 +9,13 @@ import fasttext as ft -cbow_file = path.join(path.dirname(__file__), 'cbow_params_test.bin') -input_file = path.join(path.dirname(__file__), 'params_test.txt') -output = path.join(path.dirname(__file__), 'generated_cbow') +import default_params + +test_dir = path.dirname(__file__) +cbow_file = path.join(test_dir, 'cbow_params_test.bin') +input_file = path.join(test_dir, 'params_test.txt') +output = path.join(test_dir, 'generated_cbow') +params_txt = path.join(test_dir, 'cbow_default_params_result.txt') # Test to make sure that cbow interface run correctly class TestCBOWModel(unittest.TestCase): @@ -43,6 +47,32 @@ def test_load_cbow_model(self): self.assertTrue(unicode_str in model.words) self.assertEqual(len(model[unicode_str]), model.dim) + def test_load_invalid_cbow_model(self): + # Make sure we are throwing an exception + with self.assertRaises(ValueError): + ft.load_model('/path/to/invalid') + + def test_train_cbow_model_default(self): + default_args = 
default_params.read_file(params_txt) + model = ft.cbow(input_file, output) + + # Make sure the default params of cbow is equal + # to fasttext(1) default params + self.assertEqual(model.model_name, 'cbow') + self.assertEqual(model.dim, int(default_args['dim'])) + self.assertEqual(model.ws, int(default_args['ws'])) + self.assertEqual(model.epoch, int(default_args['epoch'])) + self.assertEqual(model.min_count, int(default_args['minCount'])) + self.assertEqual(model.neg, int(default_args['neg'])) + self.assertEqual(model.word_ngrams, int(default_args['wordNgrams'])) + self.assertEqual(model.loss_name, default_args['loss']) + self.assertEqual(model.bucket, int(default_args['bucket'])) + self.assertEqual(model.minn, int(default_args['minn'])) + self.assertEqual(model.maxn, int(default_args['maxn'])) + self.assertEqual(model.lr_update_rate, + float(default_args['lrUpdateRate'])) + self.assertEqual(model.t, float(default_args['t'])) + def test_train_cbow_model(self): # set params lr=0.005 diff --git a/test/classifier_test.py b/test/classifier_test.py index 0ae9ffd..6acf391 100644 --- a/test/classifier_test.py +++ b/test/classifier_test.py @@ -9,15 +9,18 @@ import fasttext as ft -current_dir = path.dirname(__file__) -classifier_bin = path.join(current_dir, 'classifier.bin') -input_file = path.join(current_dir, 'dbpedia.train') -pred_file = path.join(current_dir, 'classifier_pred_test.txt') -output = path.join(current_dir, 'generated_classifier') -test_result = path.join(current_dir, 'classifier_test_result.txt') -pred_result = path.join(current_dir, 'classifier_pred_result.txt') -pred_k_result = path.join(current_dir, 'classifier_pred_k_result.txt') -test_file = path.join(current_dir, 'classifier_test.txt') +import default_params + +test_dir = path.dirname(__file__) +classifier_bin = path.join(test_dir, 'classifier.bin') +input_file = path.join(test_dir, 'dbpedia.train') +pred_file = path.join(test_dir, 'classifier_pred_test.txt') +output = path.join(test_dir, 
'generated_classifier') +test_result = path.join(test_dir, 'classifier_test_result.txt') +pred_result = path.join(test_dir, 'classifier_pred_result.txt') +pred_k_result = path.join(test_dir, 'classifier_pred_k_result.txt') +test_file = path.join(test_dir, 'classifier_test.txt') +params_txt = path.join(test_dir, 'classifier_default_params_result.txt') # To validate model are loaded correctly def read_labels_from_input(filename, label_prefix): @@ -97,6 +100,33 @@ def test_load_classifier_model(self): # Make sure labels are loaded correctly self.assertTrue(sorted(model.labels) == sorted(labels)) + def test_load_invalid_classifier_model(self): + # Make sure we are throwing an exception + with self.assertRaises(ValueError): + ft.load_model('/path/to/invalid', label_prefix='__label__') + + def test_train_classifier_model_default(self): + default_args = default_params.read_file(params_txt) + model = ft.supervised(input_file, output) + + # Make sure the default params of supervised is equal + # to fasttext(1) default params + self.assertEqual(model.model_name, 'supervised') + self.assertEqual(model.dim, int(default_args['dim'])) + self.assertEqual(model.ws, int(default_args['ws'])) + self.assertEqual(model.epoch, int(default_args['epoch'])) + self.assertEqual(model.min_count, int(default_args['minCount'])) + self.assertEqual(model.neg, int(default_args['neg'])) + self.assertEqual(model.word_ngrams, int(default_args['wordNgrams'])) + self.assertEqual(model.loss_name, default_args['loss']) + self.assertEqual(model.bucket, int(default_args['bucket'])) + self.assertEqual(model.minn, int(default_args['minn'])) + self.assertEqual(model.maxn, int(default_args['maxn'])) + self.assertEqual(model.lr_update_rate, + float(default_args['lrUpdateRate'])) + self.assertEqual(model.t, float(default_args['t'])) + self.assertEqual(model.label_prefix, default_args['label']) + def test_train_classifier(self): # set params dim=10 diff --git a/test/default_params.py b/test/default_params.py new 
file mode 100644 index 0000000..70a481f --- /dev/null +++ b/test/default_params.py @@ -0,0 +1,17 @@ +# Set encoding to support Python 2 +# -*- coding: utf-8 -*- + +def read_file(filename): + default_args = {} + with open(filename, 'r') as f: + for line in f: + try: + line = line.decode('utf-8') + except: + line = line + + raw = line.split(' ') + key = raw[0] + value = raw[1].strip() + default_args[key] = value + return default_args diff --git a/test/default_params_test.cc b/test/default_params_test.cc index aeac0d4..fb6e19a 100644 --- a/test/default_params_test.cc +++ b/test/default_params_test.cc @@ -6,6 +6,7 @@ int main(int argc, char **argv) { Args args; + args.parseArgs(argc, argv); std::cout << "lr " << args.lr << std::endl; std::cout << "dim " << args.dim << std::endl; std::cout << "ws " << args.ws << std::endl; diff --git a/test/default_params_test.py b/test/default_params_test.py deleted file mode 100644 index 17c2f81..0000000 --- a/test/default_params_test.py +++ /dev/null @@ -1,78 +0,0 @@ -# Set encoding to support Python 2 -# -*- coding: utf-8 -*- - -import unittest -from os import path - -import fasttext as ft - -current_dir = path.dirname(__file__) -params_txt = path.join(current_dir, 'default_params_result.txt') - -# Test to make sure that default params is equivalent as fastetxt(1) -class TestDefaultParams(unittest.TestCase): - def test_default_params(self): - default_args = {} - with open(params_txt, 'r') as f: - for line in f: - try: - line = line.decode('utf-8') - except: - line = line - - raw = line.split(' ') - key = raw[0] - value = raw[1].strip() - default_args[key] = value - - # Make sure the default value of learning rate is correct - self.assertEqual(ft.default_args['lr'], float(default_args['lr'])) - - # Make sure the default value of the dimension is correct - self.assertEqual(ft.default_args['dim'], int(default_args['dim'])) - - # Make sure the default value of ws is correct - self.assertEqual(ft.default_args['ws'], 
int(default_args['ws'])) - - # Make sure the default value of epoch is correct - self.assertEqual(ft.default_args['epoch'], int(default_args['epoch'])) - - # Make sure the default value of minCount is correct - self.assertEqual(ft.default_args['minCount'], - int(default_args['minCount'])) - - # Make sure the default value of neg is correct - self.assertEqual(ft.default_args['neg'], int(default_args['neg'])) - - # Make sure the default value of wordNgrams is correct - self.assertEqual(ft.default_args['wordNgrams'], - int(default_args['wordNgrams'])) - - # Make sure the default value of loss is correct - self.assertEqual(ft.default_args['loss'], default_args['loss']) - - # Make sure the default value of bucket is correct - self.assertEqual(ft.default_args['bucket'], - int(default_args['bucket'])) - - # Make sure the default value of minn is correct - self.assertEqual(ft.default_args['minn'], int(default_args['minn'])) - - # Make sure the default value of maxn is correct - self.assertEqual(ft.default_args['maxn'], int(default_args['maxn'])) - - # Make sure the default value of thread is correct - self.assertEqual(ft.default_args['thread'], int(default_args['thread'])) - - # Make sure the default value of lrUpdateRate is correct - self.assertEqual(ft.default_args['lrUpdateRate'], - float(default_args['lrUpdateRate'])) - - # Make sure the default value of t is correct - self.assertEqual(ft.default_args['t'], float(default_args['t'])) - - # Make sure the default value of label is correct - self.assertEqual(ft.default_args['label'], default_args['label']) - -if __name__ == '__main__': - unittest.main() diff --git a/test/skipgram_test.py b/test/skipgram_test.py index 1459e94..2c8a30e 100644 --- a/test/skipgram_test.py +++ b/test/skipgram_test.py @@ -9,9 +9,13 @@ import fasttext as ft -skipgram_file = path.join(path.dirname(__file__), 'skipgram_params_test.bin') -input_file = path.join(path.dirname(__file__), 'params_test.txt') -output = path.join(path.dirname(__file__), 
'generated_skipgram') +import default_params + +test_dir = path.dirname(__file__) +skipgram_file = path.join(test_dir, 'skipgram_params_test.bin') +input_file = path.join(test_dir, 'params_test.txt') +output = path.join(test_dir, 'generated_skipgram') +params_txt = path.join(test_dir, 'skipgram_default_params_result.txt') # Test to make sure that skipgram interface run correctly class TestSkipgramModel(unittest.TestCase): @@ -43,6 +47,32 @@ def test_load_skipgram_model(self): self.assertTrue(unicode_str in model.words) self.assertEqual(len(model[unicode_str]), model.dim) + def test_load_invalid_skipgram_model(self): + # Make sure we are throwing an exception + with self.assertRaises(ValueError): + ft.load_model('/path/to/invalid') + + def test_train_skipgram_model_default(self): + default_args = default_params.read_file(params_txt) + model = ft.skipgram(input_file, output) + + # Make sure the default params of skipgram is equal + # to fasttext(1) default params + self.assertEqual(model.model_name, 'skipgram') + self.assertEqual(model.dim, int(default_args['dim'])) + self.assertEqual(model.ws, int(default_args['ws'])) + self.assertEqual(model.epoch, int(default_args['epoch'])) + self.assertEqual(model.min_count, int(default_args['minCount'])) + self.assertEqual(model.neg, int(default_args['neg'])) + self.assertEqual(model.word_ngrams, int(default_args['wordNgrams'])) + self.assertEqual(model.loss_name, default_args['loss']) + self.assertEqual(model.bucket, int(default_args['bucket'])) + self.assertEqual(model.minn, int(default_args['minn'])) + self.assertEqual(model.maxn, int(default_args['maxn'])) + self.assertEqual(model.lr_update_rate, + float(default_args['lrUpdateRate'])) + self.assertEqual(model.t, float(default_args['t'])) + def test_train_skipgram_model(self): # set params lr=0.005 From e41dcd42006ce1c4aeec927b0cb736d62479ca39 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Mon, 29 Aug 2016 14:22:37 +0700 Subject: [PATCH 072/109] Ignore 
test/*_result.txt --- .gitignore | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 8df159a..14a024c 100644 --- a/.gitignore +++ b/.gitignore @@ -22,12 +22,9 @@ facebookresearch-fasttext-* .eggs/ # For test -test/classifier_pred_result.txt -test/classifier_test_result.txt +test/*_result.txt test/dbpedia.train -test/classifier_pred_k_result.txt test/dbpedia_csv/ -test/default_params_result.txt test/default_params_test # Misc From 345f146e8e26421eac6033aa5b09bbc053fd415c Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Mon, 29 Aug 2016 14:23:36 +0700 Subject: [PATCH 073/109] Set fasttext.__VERSION__ --- fasttext/__init__.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fasttext/__init__.py b/fasttext/__init__.py index 6af7367..9b965b3 100644 --- a/fasttext/__init__.py +++ b/fasttext/__init__.py @@ -3,7 +3,13 @@ from .fasttext import load_model from .fasttext import supervised -@property -def __VERSION__(): - with open('VERSION') as f: +import os + +dir_path = os.path.dirname(os.path.realpath(__file__)) +version_path = os.path.join(dir_path, 'VERSION') + +def _read_version(): + with open(version_path) as f: return f.read().strip() + +__VERSION__ = _read_version() From 4084284fb4031ce7e70cb8d9fafc59f75a47119f Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Mon, 29 Aug 2016 14:36:33 +0700 Subject: [PATCH 074/109] Update default params docs --- README.md | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ README.rst | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 174 insertions(+) diff --git a/README.md b/README.md index 0ab7b29..b889217 100644 --- a/README.md +++ b/README.md @@ -174,6 +174,35 @@ Train & load skipgram model model = fasttext.skipgram(params) ``` +List of available `params` and their default value: + +``` +input training file path (required) +output output file path (required) +lr learning rate [0.05] +lr_update_rate change the 
rate of updates for the learning rate [100] +dim size of word vectors [100] +ws size of the context window [5] +epoch number of epochs [5] +min_count minimal number of word occurences [5] +neg number of negatives sampled [5] +word_ngrams max length of word ngram [1] +loss loss function {ns, hs, softmax} [ns] +bucket number of buckets [2000000] +minn min length of char ngram [3] +maxn max length of char ngram [6] +thread number of threads [12] +t sampling threshold [0.0001] +silent disable the log output from the C++ extension [1] + +``` + +Example usage: + +```python +model = fasttext.skipgram('train.txt', 'model', lr=0.1, dim=300) +``` + ### CBOW model Train & load CBOW model @@ -182,6 +211,35 @@ Train & load CBOW model model = fasttext.cbow(params) ``` +List of available `params` and their default value: + +``` +input training file path (required) +output output file path (required) +lr learning rate [0.05] +lr_update_rate change the rate of updates for the learning rate [100] +dim size of word vectors [100] +ws size of the context window [5] +epoch number of epochs [5] +min_count minimal number of word occurences [5] +neg number of negatives sampled [5] +word_ngrams max length of word ngram [1] +loss loss function {ns, hs, softmax} [ns] +bucket number of buckets [2000000] +minn min length of char ngram [3] +maxn max length of char ngram [6] +thread number of threads [12] +t sampling threshold [0.0001] +silent disable the log output from the C++ extension [1] + +``` + +Example usage: + +```python +model = fasttext.cbow('train.txt', 'model', lr=0.1, dim=300) +``` + ### Load pre-trained model File `.bin` that previously trained or generated by fastText can be @@ -221,6 +279,36 @@ Train & load the classifier classifier = fasttext.supervised(params) ``` +List of available `params` and their default value: + +``` +input training file path (required) +output output file path (required) +label_prefix label prefix ['__label__'] +lr learning rate [0.05] +lr_update_rate 
change the rate of updates for the learning rate [100] +dim size of word vectors [100] +ws size of the context window [5] +epoch number of epochs [5] +min_count minimal number of word occurences [1] +neg number of negatives sampled [5] +word_ngrams max length of word ngram [1] +loss loss function {ns, hs, softmax} [softmax] +bucket number of buckets [2000000] +minn min length of char ngram [3] +maxn max length of char ngram [6] +thread number of threads [12] +t sampling threshold [0.0001] +silent disable the log output from the C++ extension [1] + +``` + +Example usage: + +```python +classifier = fasttext.supervised('train.txt', 'model', label_prefix='__myprefix__', + thread=4) +``` ### Load pre-trained classifier diff --git a/README.rst b/README.rst index 1129921..9aac5e9 100644 --- a/README.rst +++ b/README.rst @@ -186,6 +186,34 @@ Train & load skipgram model model = fasttext.skipgram(params) +List of available ``params`` and their default value: + +:: + + input training file path (required) + output output file path (required) + lr learning rate [0.05] + lr_update_rate change the rate of updates for the learning rate [100] + dim size of word vectors [100] + ws size of the context window [5] + epoch number of epochs [5] + min_count minimal number of word occurences [5] + neg number of negatives sampled [5] + word_ngrams max length of word ngram [1] + loss loss function {ns, hs, softmax} [ns] + bucket number of buckets [2000000] + minn min length of char ngram [3] + maxn max length of char ngram [6] + thread number of threads [12] + t sampling threshold [0.0001] + silent disable the log output from the C++ extension [1] + +Example usage: + +.. 
code:: python + + model = fasttext.skipgram('train.txt', 'model', lr=0.1, dim=300) + CBOW model ~~~~~~~~~~ @@ -195,6 +223,34 @@ Train & load CBOW model model = fasttext.cbow(params) +List of available ``params`` and their default value: + +:: + + input training file path (required) + output output file path (required) + lr learning rate [0.05] + lr_update_rate change the rate of updates for the learning rate [100] + dim size of word vectors [100] + ws size of the context window [5] + epoch number of epochs [5] + min_count minimal number of word occurences [5] + neg number of negatives sampled [5] + word_ngrams max length of word ngram [1] + loss loss function {ns, hs, softmax} [ns] + bucket number of buckets [2000000] + minn min length of char ngram [3] + maxn max length of char ngram [6] + thread number of threads [12] + t sampling threshold [0.0001] + silent disable the log output from the C++ extension [1] + +Example usage: + +.. code:: python + + model = fasttext.cbow('train.txt', 'model', lr=0.1, dim=300) + Load pre-trained model ~~~~~~~~~~~~~~~~~~~~~~ @@ -237,6 +293,36 @@ Train & load the classifier classifier = fasttext.supervised(params) +List of available ``params`` and their default value: + +:: + + input training file path (required) + output output file path (required) + label_prefix label prefix ['__label__'] + lr learning rate [0.05] + lr_update_rate change the rate of updates for the learning rate [100] + dim size of word vectors [100] + ws size of the context window [5] + epoch number of epochs [5] + min_count minimal number of word occurences [1] + neg number of negatives sampled [5] + word_ngrams max length of word ngram [1] + loss loss function {ns, hs, softmax} [softmax] + bucket number of buckets [2000000] + minn min length of char ngram [3] + maxn max length of char ngram [6] + thread number of threads [12] + t sampling threshold [0.0001] + silent disable the log output from the C++ extension [1] + +Example usage: + +.. 
code:: python + + classifier = fasttext.supervised('train.txt', 'model', label_prefix='__myprefix__', + thread=4) + Load pre-trained classifier ~~~~~~~~~~~~~~~~~~~~~~~~~~~ From a367565775249a0065269d16a7a4dc3b6c0c3af9 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Mon, 29 Aug 2016 14:57:32 +0700 Subject: [PATCH 075/109] Add fasttext/VERSION as data_files in setup.py --- MANIFEST.in | 1 - setup.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/MANIFEST.in b/MANIFEST.in index f062fd5..6dc8d42 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,6 +2,5 @@ global-include *.pyx global-include *.pxd global-include *.cc global-include *.h -global-include fasttext/VERSION prune facebookresearch-fastText-* diff --git a/setup.py b/setup.py index d4a25a0..237d0b7 100644 --- a/setup.py +++ b/setup.py @@ -43,6 +43,7 @@ def read_version(): long_description=open('README.rst', 'r').read(), license='BSD 3-Clause License', packages=['fasttext'], + data_files=[('fasttext', ['fasttext/VERSION'])], ext_modules = cythonize(extensions), install_requires=[ 'numpy>=1', From 9cda45e95d44782c5915d44625ffaa5db6bd51b5 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Mon, 29 Aug 2016 15:23:32 +0700 Subject: [PATCH 076/109] Update self.assertRaises to support Python 2.6 --- test/cbow_test.py | 4 ++-- test/classifier_test.py | 4 ++-- test/skipgram_test.py | 3 +-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/test/cbow_test.py b/test/cbow_test.py index de1d107..8838829 100644 --- a/test/cbow_test.py +++ b/test/cbow_test.py @@ -49,8 +49,8 @@ def test_load_cbow_model(self): def test_load_invalid_cbow_model(self): # Make sure we are throwing an exception - with self.assertRaises(ValueError): - ft.load_model('/path/to/invalid') + self.assertRaises(ValueError, ft.load_model, + '/path/to/invalid') def test_train_cbow_model_default(self): default_args = default_params.read_file(params_txt) diff --git a/test/classifier_test.py b/test/classifier_test.py index 
6acf391..44a26c5 100644 --- a/test/classifier_test.py +++ b/test/classifier_test.py @@ -102,8 +102,8 @@ def test_load_classifier_model(self): def test_load_invalid_classifier_model(self): # Make sure we are throwing an exception - with self.assertRaises(ValueError): - ft.load_model('/path/to/invalid', label_prefix='__label__') + self.assertRaises(ValueError, ft.load_model, '/path/to/invalid', + label_prefix='__label__') def test_train_classifier_model_default(self): default_args = default_params.read_file(params_txt) diff --git a/test/skipgram_test.py b/test/skipgram_test.py index 2c8a30e..1d53335 100644 --- a/test/skipgram_test.py +++ b/test/skipgram_test.py @@ -49,8 +49,7 @@ def test_load_skipgram_model(self): def test_load_invalid_skipgram_model(self): # Make sure we are throwing an exception - with self.assertRaises(ValueError): - ft.load_model('/path/to/invalid') + self.assertRaises(ValueError, ft.load_model, '/path/to/invalid') def test_train_skipgram_model_default(self): default_args = default_params.read_file(params_txt) From 68a5b270ee7fa9308eb882596094b01c3f33bd9c Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Mon, 29 Aug 2016 15:46:00 +0700 Subject: [PATCH 077/109] Update docs --- README.md | 27 --------------------------- README.rst | 29 ----------------------------- 2 files changed, 56 deletions(-) diff --git a/README.md b/README.md index b889217..987249c 100644 --- a/README.md +++ b/README.md @@ -380,33 +380,6 @@ classifier.predict(texts, k) # Predict the most likely label The param `k` for `classifier.test` and `classifier.predict` is optional, and equal to `1` by default. 
-### Params -List of available `params` and their default value: - -``` -For Skipgram, CBOW and Supervised model -input training file path -output output file path -lr learning rate [0.05] -lr_update_rate change the rate of updates for the learning rate [100] -dim size of word vectors [100] -ws size of the context window [5] -epoch number of epochs [5] -min_count minimal number of word occurences [1] -neg number of negatives sampled [5] -word_ngrams max length of word ngram [1] -loss loss function {ns, hs, softmax} [ns] -bucket number of buckets [2000000] -minn min length of char ngram [3] -maxn max length of char ngram [6] -thread number of threads [12] -t sampling threshold [0.0001] -silent disable the log output from the C++ extension [1] - -For Supervised model only -label_prefix Prefix of the label name [__label__] -``` - ## References ### Enriching Word Vectors with Subword Information diff --git a/README.rst b/README.rst index 9aac5e9..478e89a 100644 --- a/README.rst +++ b/README.rst @@ -395,35 +395,6 @@ Classifier have the following atributes & methods The param ``k`` for ``classifier.test`` and ``classifier.predict`` is optional, and equal to ``1`` by default. 
-Params -~~~~~~ - -List of available ``params`` and their default value: - -:: - - For Skipgram, CBOW and Supervised model - input training file path - output output file path - lr learning rate [0.05] - lr_update_rate change the rate of updates for the learning rate [100] - dim size of word vectors [100] - ws size of the context window [5] - epoch number of epochs [5] - min_count minimal number of word occurences [1] - neg number of negatives sampled [5] - word_ngrams max length of word ngram [1] - loss loss function {ns, hs, softmax} [ns] - bucket number of buckets [2000000] - minn min length of char ngram [3] - maxn max length of char ngram [6] - thread number of threads [12] - t sampling threshold [0.0001] - silent disable the log output from the C++ extension [1] - - For Supervised model only - label_prefix Prefix of the label name [__label__] - References ---------- From 82cdb66a4ee22d0da47fb8f419a2170ef902f84b Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Mon, 29 Aug 2016 15:50:57 +0700 Subject: [PATCH 078/109] Build PR and master branch only --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index cc3887a..e13fa0e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,6 +8,8 @@ python: - '3.5' install: make install script: make test +branches: + only: [master] notifications: slack: secure: 
pStiYmzBbnb0W18r1i1Lz8FIakVHajsv3on1vWy8fNWVuPfaJ85ZqJhnmrr2HKlanZcl6bEMnllDctzt/F+u4HfeXHmhS7a9nZbRDDGyWIxsvJA/UqPt2byLEB1u+KbLb53eDu7MTIe63tzk1zq+4BTupI+btc4igiUuzAhqh4+LP9eZe2L58aC+jOzIn/9Kno7+xawhj2DKs6m3O/hcXFORcOpdtWRFpoDa66dN7xPVbN0hYD80uVApEpghnHToiJN0HhhB92YmZHa1ByWj7u9VN1Eaex1srGQOJQG3FaDBJY1r2e9c7Sj+33gkZb1AqjeOpxhRsxxVUdigDvCoxIrr6ll0/p3n6pUfRGQ7SB1A7NoRBC+g6aTJbOLr5NjQDBmZHaFXx/QFd1h0EUfgBybDI3v4cKOtV8vIFoT1xdkGs/Hjo4v9z4KO6R135uDBwaJAo9cWx360xV1UK1cb4kfzdbJFk4mNmMEbdwJHT27a7e3uWr1lu6CrMUzVk0EXj1BroKC7jcRK7qthr9DcfW2mmGG3JTIKQ6+nYSEF0KC/JjjbIsg/2hKtq7mACzrHrluN6HbqCF6Kd2n2rfItsqIaCo6LEmgZ2fo69R34i96QzyHpplBivWOgC+pwLOe0FiseuleSCZ/kQgJPf62gsqCan6+GkazoEp9Ow+lPMkA= From 1ddb64318e209433e81abd74a2f30ac1c6927012 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Mon, 29 Aug 2016 16:22:01 +0700 Subject: [PATCH 079/109] Add target to upload and install from pypitest server --- Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Makefile b/Makefile index 21803ba..6983717 100644 --- a/Makefile +++ b/Makefile @@ -20,6 +20,14 @@ README.rst: README.md upload: README.rst python setup.py sdist upload +upload-to-pypitest: README.rst + python setup.py sdist upload -r pypitest +.PHONY: upload-to-pypitest + +install-from-pypitest:: + pip install -U --no-cache-dir -i https://testpypi.python.org/pypi fasttext +.PHONY: install-from-pypitest + install-dev: README.rst python setup.py develop .PHONY: install-dev From c9c04b382e66f97e70ab08228316f9ec3bb4f494 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Tue, 30 Aug 2016 11:29:24 +0700 Subject: [PATCH 080/109] Build & test the release tags --- .travis.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index e13fa0e..cc45638 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,9 @@ python: install: make install script: make test branches: - only: [master] + only: + - master + - /^v\d{,2}\.\d{,3}\.\d{,3}$/ notifications: slack: secure: 
pStiYmzBbnb0W18r1i1Lz8FIakVHajsv3on1vWy8fNWVuPfaJ85ZqJhnmrr2HKlanZcl6bEMnllDctzt/F+u4HfeXHmhS7a9nZbRDDGyWIxsvJA/UqPt2byLEB1u+KbLb53eDu7MTIe63tzk1zq+4BTupI+btc4igiUuzAhqh4+LP9eZe2L58aC+jOzIn/9Kno7+xawhj2DKs6m3O/hcXFORcOpdtWRFpoDa66dN7xPVbN0hYD80uVApEpghnHToiJN0HhhB92YmZHa1ByWj7u9VN1Eaex1srGQOJQG3FaDBJY1r2e9c7Sj+33gkZb1AqjeOpxhRsxxVUdigDvCoxIrr6ll0/p3n6pUfRGQ7SB1A7NoRBC+g6aTJbOLr5NjQDBmZHaFXx/QFd1h0EUfgBybDI3v4cKOtV8vIFoT1xdkGs/Hjo4v9z4KO6R135uDBwaJAo9cWx360xV1UK1cb4kfzdbJFk4mNmMEbdwJHT27a7e3uWr1lu6CrMUzVk0EXj1BroKC7jcRK7qthr9DcfW2mmGG3JTIKQ6+nYSEF0KC/JjjbIsg/2hKtq7mACzrHrluN6HbqCF6Kd2n2rfItsqIaCo6LEmgZ2fo69R34i96QzyHpplBivWOgC+pwLOe0FiseuleSCZ/kQgJPf62gsqCan6+GkazoEp9Ow+lPMkA= From 25ad0d4b42ba76635a2e0e15d0007d196e672e74 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Wed, 31 Aug 2016 17:30:11 +0700 Subject: [PATCH 081/109] Update fastText from 3223526 to fabb04e --- fasttext/cpp/LAST_COMMIT | 2 +- fasttext/cpp/src/args.cc | 67 +++---- fasttext/cpp/src/args.h | 6 +- fasttext/cpp/src/dictionary.cc | 85 +++++---- fasttext/cpp/src/dictionary.h | 18 +- fasttext/cpp/src/fasttext.cc | 318 +++++++++++++++------------------ fasttext/cpp/src/fasttext.h | 52 ++++++ fasttext/cpp/src/matrix.cc | 16 +- fasttext/cpp/src/matrix.h | 7 +- fasttext/cpp/src/model.cc | 119 ++++++------ fasttext/cpp/src/model.h | 30 ++-- fasttext/cpp/src/utils.cc | 7 +- fasttext/cpp/src/vector.cc | 1 - fasttext/cpp/src/vector.h | 2 +- update-fasttext.sh | 2 +- 15 files changed, 387 insertions(+), 345 deletions(-) create mode 100644 fasttext/cpp/src/fasttext.h diff --git a/fasttext/cpp/LAST_COMMIT b/fasttext/cpp/LAST_COMMIT index 6a458e6..aec85aa 100644 --- a/fasttext/cpp/LAST_COMMIT +++ b/fasttext/cpp/LAST_COMMIT @@ -1 +1 @@ -3223526 +fabb04e diff --git a/fasttext/cpp/src/args.cc b/fasttext/cpp/src/args.cc index 3169db5..55287d0 100644 --- a/fasttext/cpp/src/args.cc +++ b/fasttext/cpp/src/args.cc @@ -13,7 +13,6 @@ #include #include -#include Args::Args() { lr = 0.05; @@ -40,6 +39,9 @@ void 
Args::parseArgs(int argc, char** argv) { model = model_name::sup; loss = loss_name::softmax; minCount = 1; + minn = 0; + maxn = 0; + lr = 0.1; } else if (command == "cbow") { model = model_name::cbow; } @@ -112,6 +114,9 @@ void Args::parseArgs(int argc, char** argv) { printHelp(); exit(EXIT_FAILURE); } + if (wordNgrams <= 1 && maxn == 0) { + bucket = 0; + } } void Args::printHelp() { @@ -139,38 +144,34 @@ void Args::printHelp() { << std::endl; } -void Args::save(std::ofstream& ofs) { - if (ofs.is_open()) { - ofs.write((char*) &(dim), sizeof(int)); - ofs.write((char*) &(ws), sizeof(int)); - ofs.write((char*) &(epoch), sizeof(int)); - ofs.write((char*) &(minCount), sizeof(int)); - ofs.write((char*) &(neg), sizeof(int)); - ofs.write((char*) &(wordNgrams), sizeof(int)); - ofs.write((char*) &(loss), sizeof(loss_name)); - ofs.write((char*) &(model), sizeof(model_name)); - ofs.write((char*) &(bucket), sizeof(int)); - ofs.write((char*) &(minn), sizeof(int)); - ofs.write((char*) &(maxn), sizeof(int)); - ofs.write((char*) &(lrUpdateRate), sizeof(int)); - ofs.write((char*) &(t), sizeof(double)); - } +void Args::save(std::ostream& out) { + out.write((char*) &(dim), sizeof(int)); + out.write((char*) &(ws), sizeof(int)); + out.write((char*) &(epoch), sizeof(int)); + out.write((char*) &(minCount), sizeof(int)); + out.write((char*) &(neg), sizeof(int)); + out.write((char*) &(wordNgrams), sizeof(int)); + out.write((char*) &(loss), sizeof(loss_name)); + out.write((char*) &(model), sizeof(model_name)); + out.write((char*) &(bucket), sizeof(int)); + out.write((char*) &(minn), sizeof(int)); + out.write((char*) &(maxn), sizeof(int)); + out.write((char*) &(lrUpdateRate), sizeof(int)); + out.write((char*) &(t), sizeof(double)); } -void Args::load(std::ifstream& ifs) { - if (ifs.is_open()) { - ifs.read((char*) &(dim), sizeof(int)); - ifs.read((char*) &(ws), sizeof(int)); - ifs.read((char*) &(epoch), sizeof(int)); - ifs.read((char*) &(minCount), sizeof(int)); - ifs.read((char*) &(neg), 
sizeof(int)); - ifs.read((char*) &(wordNgrams), sizeof(int)); - ifs.read((char*) &(loss), sizeof(loss_name)); - ifs.read((char*) &(model), sizeof(model_name)); - ifs.read((char*) &(bucket), sizeof(int)); - ifs.read((char*) &(minn), sizeof(int)); - ifs.read((char*) &(maxn), sizeof(int)); - ifs.read((char*) &(lrUpdateRate), sizeof(int)); - ifs.read((char*) &(t), sizeof(double)); - } +void Args::load(std::istream& in) { + in.read((char*) &(dim), sizeof(int)); + in.read((char*) &(ws), sizeof(int)); + in.read((char*) &(epoch), sizeof(int)); + in.read((char*) &(minCount), sizeof(int)); + in.read((char*) &(neg), sizeof(int)); + in.read((char*) &(wordNgrams), sizeof(int)); + in.read((char*) &(loss), sizeof(loss_name)); + in.read((char*) &(model), sizeof(model_name)); + in.read((char*) &(bucket), sizeof(int)); + in.read((char*) &(minn), sizeof(int)); + in.read((char*) &(maxn), sizeof(int)); + in.read((char*) &(lrUpdateRate), sizeof(int)); + in.read((char*) &(t), sizeof(double)); } diff --git a/fasttext/cpp/src/args.h b/fasttext/cpp/src/args.h index 69eeb28..0efe6b7 100644 --- a/fasttext/cpp/src/args.h +++ b/fasttext/cpp/src/args.h @@ -10,6 +10,8 @@ #ifndef FASTTEXT_ARGS_H #define FASTTEXT_ARGS_H +#include +#include #include enum class model_name : int {cbow=1, sg, sup}; @@ -40,8 +42,8 @@ class Args { void parseArgs(int, char**); void printHelp(); - void save(std::ofstream&); - void load(std::ifstream&); + void save(std::ostream&); + void load(std::istream&); }; #endif diff --git a/fasttext/cpp/src/dictionary.cc b/fasttext/cpp/src/dictionary.cc index 19a80b7..4231375 100644 --- a/fasttext/cpp/src/dictionary.cc +++ b/fasttext/cpp/src/dictionary.cc @@ -17,15 +17,12 @@ #include #include -#include "args.h" - -extern Args args; - const std::string Dictionary::EOS = ""; const std::string Dictionary::BOW = "<"; const std::string Dictionary::EOW = ">"; -Dictionary::Dictionary() { +Dictionary::Dictionary(std::shared_ptr args) { + args_ = args; size_ = 0; nwords_ = 0; nlabels_ = 0; @@ 
-51,7 +48,7 @@ void Dictionary::add(const std::string& w) { entry e; e.word = w; e.count = 1; - e.type = (w.find(args.label) == 0) ? entry_type::label : entry_type::word; + e.type = (w.find(args_->label) == 0) ? entry_type::label : entry_type::word; words_.push_back(e); word2int_[h] = size_++; } else { @@ -91,7 +88,7 @@ const std::vector Dictionary::getNgrams(const std::string& word) { bool Dictionary::discard(int32_t id, real rand) { assert(id >= 0); assert(id < nwords_); - if (args.model == model_name::sup) return false; + if (args_->model == model_name::sup) return false; return rand > pdiscard_[id]; } @@ -126,13 +123,13 @@ void Dictionary::computeNgrams(const std::string& word, for (size_t i = 0; i < word.size(); i++) { std::string ngram; if ((word[i] & 0xC0) == 0x80) continue; - for (size_t j = i, n = 1; j < word.size() && n <= args.maxn; n++) { + for (size_t j = i, n = 1; j < word.size() && n <= args_->maxn; n++) { ngram.push_back(word[j++]); while (j < word.size() && (word[j] & 0xC0) == 0x80) { ngram.push_back(word[j++]); } - if (n >= args.minn) { - int32_t h = hash(ngram) % args.bucket; + if (n >= args_->minn) { + int32_t h = hash(ngram) % args_->bucket; ngrams.push_back(nwords_ + h); } } @@ -147,12 +144,12 @@ void Dictionary::initNgrams() { } } -bool Dictionary::readWord(std::ifstream& fin, std::string& word) +bool Dictionary::readWord(std::istream& in, std::string& word) { char c; word.clear(); - while (fin.peek() != EOF) { - fin.get(c); + while (in.peek() != EOF) { + in.get(c); if (isspace(c) || c == 0) { if (word.empty()) { if (c == '\n') { @@ -161,7 +158,7 @@ bool Dictionary::readWord(std::ifstream& fin, std::string& word) } continue; } else { - if (c == '\n') fin.unget(); + if (c == '\n') in.unget(); return true; } } @@ -170,10 +167,10 @@ bool Dictionary::readWord(std::ifstream& fin, std::string& word) return !word.empty(); } -void Dictionary::readFromFile(std::ifstream& ifs) { +void Dictionary::readFromFile(std::istream& in) { std::string word; 
int64_t minThreshold = 1; - while (readWord(ifs, word)) { + while (readWord(in, word)) { add(word); if (ntokens_ % 1000000 == 0) { std::cout << "\rRead " << ntokens_ / 1000000 << "M words" << std::flush; @@ -183,9 +180,11 @@ void Dictionary::readFromFile(std::ifstream& ifs) { } } std::cout << "\rRead " << ntokens_ / 1000000 << "M words" << std::endl; - threshold(args.minCount); + threshold(args_->minCount); initTableDiscard(); initNgrams(); + std::cout << "Number of words: " << nwords_ << std::endl; + std::cout << "Number of labels: " << nlabels_ << std::endl; } void Dictionary::threshold(int64_t t) { @@ -194,7 +193,7 @@ void Dictionary::threshold(int64_t t) { return e1.count > e2.count; }); words_.erase(remove_if(words_.begin(), words_.end(), [&](const entry& e) { - return e.count < t; + return e.type == entry_type::word && e.count < t; }), words_.end()); words_.shrink_to_fit(); size_ = 0; @@ -215,7 +214,7 @@ void Dictionary::initTableDiscard() { pdiscard_.resize(size_); for (size_t i = 0; i < size_; i++) { real f = real(words_[i].count) / real(ntokens_); - pdiscard_[i] = sqrt(args.t / f) + args.t / f; + pdiscard_[i] = sqrt(args_->t / f) + args_->t / f; } } @@ -233,12 +232,12 @@ void Dictionary::addNgrams(std::vector& line, int32_t n) { uint64_t h = line[i]; for (int32_t j = i + 1; j < line_size && j < i + n; j++) { h = h * 116049371 + line[j]; - line.push_back(nwords_ + (h % args.bucket)); + line.push_back(nwords_ + (h % args_->bucket)); } } } -int32_t Dictionary::getLine(std::ifstream& ifs, +int32_t Dictionary::getLine(std::istream& in, std::vector& words, std::vector& labels, std::minstd_rand& rng) { @@ -247,11 +246,11 @@ int32_t Dictionary::getLine(std::ifstream& ifs, int32_t ntokens = 0; words.clear(); labels.clear(); - if (ifs.eof()) { - ifs.clear(); - ifs.seekg(std::streampos(0)); + if (in.eof()) { + in.clear(); + in.seekg(std::streampos(0)); } - while (readWord(ifs, token)) { + while (readWord(in, token)) { if (token == EOS) break; int32_t wid = 
getId(token); if (wid < 0) continue; @@ -263,7 +262,7 @@ int32_t Dictionary::getLine(std::ifstream& ifs, if (type == entry_type::label) { labels.push_back(wid - nwords_); } - if (words.size() > MAX_LINE_SIZE && args.model != model_name::sup) break; + if (words.size() > MAX_LINE_SIZE && args_->model != model_name::sup) break; } return ntokens; } @@ -274,37 +273,37 @@ std::string Dictionary::getLabel(int32_t lid) { return words_[lid + nwords_].word; } -void Dictionary::save(std::ofstream& ofs) { - ofs.write((char*) &size_, sizeof(int32_t)); - ofs.write((char*) &nwords_, sizeof(int32_t)); - ofs.write((char*) &nlabels_, sizeof(int32_t)); - ofs.write((char*) &ntokens_, sizeof(int64_t)); +void Dictionary::save(std::ostream& out) { + out.write((char*) &size_, sizeof(int32_t)); + out.write((char*) &nwords_, sizeof(int32_t)); + out.write((char*) &nlabels_, sizeof(int32_t)); + out.write((char*) &ntokens_, sizeof(int64_t)); for (int32_t i = 0; i < size_; i++) { entry e = words_[i]; - ofs.write(e.word.data(), e.word.size() * sizeof(char)); - ofs.put(0); - ofs.write((char*) &(e.count), sizeof(int64_t)); - ofs.write((char*) &(e.type), sizeof(entry_type)); + out.write(e.word.data(), e.word.size() * sizeof(char)); + out.put(0); + out.write((char*) &(e.count), sizeof(int64_t)); + out.write((char*) &(e.type), sizeof(entry_type)); } } -void Dictionary::load(std::ifstream& ifs) { +void Dictionary::load(std::istream& in) { words_.clear(); for (int32_t i = 0; i < MAX_VOCAB_SIZE; i++) { word2int_[i] = -1; } - ifs.read((char*) &size_, sizeof(int32_t)); - ifs.read((char*) &nwords_, sizeof(int32_t)); - ifs.read((char*) &nlabels_, sizeof(int32_t)); - ifs.read((char*) &ntokens_, sizeof(int64_t)); + in.read((char*) &size_, sizeof(int32_t)); + in.read((char*) &nwords_, sizeof(int32_t)); + in.read((char*) &nlabels_, sizeof(int32_t)); + in.read((char*) &ntokens_, sizeof(int64_t)); for (int32_t i = 0; i < size_; i++) { char c; entry e; - while ((c = ifs.get()) != 0) { + while ((c = in.get()) != 0) 
{ e.word.push_back(c); } - ifs.read((char*) &e.count, sizeof(int64_t)); - ifs.read((char*) &e.type, sizeof(entry_type)); + in.read((char*) &e.count, sizeof(int64_t)); + in.read((char*) &e.type, sizeof(entry_type)); words_.push_back(e); word2int_[find(e.word)] = i; } diff --git a/fasttext/cpp/src/dictionary.h b/fasttext/cpp/src/dictionary.h index 7d2fe26..70bf821 100644 --- a/fasttext/cpp/src/dictionary.h +++ b/fasttext/cpp/src/dictionary.h @@ -12,9 +12,12 @@ #include #include -#include +#include +#include #include +#include +#include "args.h" #include "real.h" typedef int32_t id_type; @@ -37,6 +40,7 @@ class Dictionary { void initNgrams(); void threshold(int64_t); + std::shared_ptr args_; std::vector word2int_; std::vector words_; std::vector pdiscard_; @@ -50,7 +54,7 @@ class Dictionary { static const std::string BOW; static const std::string EOW; - Dictionary(); + explicit Dictionary(std::shared_ptr); int32_t nwords(); int32_t nlabels(); int64_t ntokens(); @@ -63,14 +67,14 @@ class Dictionary { void computeNgrams(const std::string&, std::vector&); uint32_t hash(const std::string& str); void add(const std::string&); - bool readWord(std::ifstream&, std::string&); - void readFromFile(std::ifstream&); + bool readWord(std::istream&, std::string&); + void readFromFile(std::istream&); std::string getLabel(int32_t); - void save(std::ofstream&); - void load(std::ifstream&); + void save(std::ostream&); + void load(std::istream&); std::vector getCounts(entry_type); void addNgrams(std::vector&, int32_t); - int32_t getLine(std::ifstream&, std::vector&, + int32_t getLine(std::istream&, std::vector&, std::vector&, std::minstd_rand&); }; diff --git a/fasttext/cpp/src/fasttext.cc b/fasttext/cpp/src/fasttext.cc index 7a7cc47..dea7d79 100644 --- a/fasttext/cpp/src/fasttext.cc +++ b/fasttext/cpp/src/fasttext.cc @@ -7,6 +7,8 @@ * of patent rights can be found in the PATENTS file in the same directory. 
*/ +#include "fasttext.h" + #include #include #include @@ -16,152 +18,137 @@ #include #include #include -#include #include -#include "matrix.h" -#include "vector.h" -#include "dictionary.h" -#include "model.h" -#include "utils.h" -#include "real.h" -#include "args.h" - -Args args; - -namespace info { - clock_t start; - std::atomic allWords(0); - std::atomic allN(0); - double allLoss(0.0); -} - -void getVector(Dictionary& dict, Matrix& input, Vector& vec, std::string word) { - const std::vector& ngrams = dict.getNgrams(word); +void FastText::getVector(Vector& vec, const std::string& word) { + const std::vector& ngrams = dict_->getNgrams(word); vec.zero(); for (auto it = ngrams.begin(); it != ngrams.end(); ++it) { - vec.addRow(input, *it); + vec.addRow(*input_, *it); } if (ngrams.size() > 0) { vec.mul(1.0 / ngrams.size()); } } -void saveVectors(Dictionary& dict, Matrix& input, Matrix& output) { - std::ofstream ofs(args.output + ".vec"); +void FastText::saveVectors() { + std::ofstream ofs(args_->output + ".vec"); if (!ofs.is_open()) { std::cout << "Error opening file for saving vectors." 
<< std::endl; exit(EXIT_FAILURE); } - ofs << dict.nwords() << " " << args.dim << std::endl; - Vector vec(args.dim); - for (int32_t i = 0; i < dict.nwords(); i++) { - std::string word = dict.getWord(i); - getVector(dict, input, vec, word); + ofs << dict_->nwords() << " " << args_->dim << std::endl; + Vector vec(args_->dim); + for (int32_t i = 0; i < dict_->nwords(); i++) { + std::string word = dict_->getWord(i); + getVector(vec, word); ofs << word << " " << vec << std::endl; } ofs.close(); } -void printVectors(Dictionary& dict, Matrix& input) { +void FastText::printVectors() { std::string word; - Vector vec(args.dim); + Vector vec(args_->dim); while (std::cin >> word) { - getVector(dict, input, vec, word); + getVector(vec, word); std::cout << word << " " << vec << std::endl; } } -void saveModel(Dictionary& dict, Matrix& input, Matrix& output) { - std::ofstream ofs(args.output + ".bin"); +void FastText::saveModel() { + std::ofstream ofs(args_->output + ".bin"); if (!ofs.is_open()) { std::cerr << "Model file cannot be opened for saving!" << std::endl; exit(EXIT_FAILURE); } - args.save(ofs); - dict.save(ofs); - input.save(ofs); - output.save(ofs); + args_->save(ofs); + dict_->save(ofs); + input_->save(ofs); + output_->save(ofs); ofs.close(); } -void loadModel(std::string filename, Dictionary& dict, - Matrix& input, Matrix& output) { +void FastText::loadModel(const std::string& filename) { std::ifstream ifs(filename); if (!ifs.is_open()) { std::cerr << "Model file cannot be opened for loading!" 
<< std::endl; exit(EXIT_FAILURE); } - args.load(ifs); - dict.load(ifs); - input.load(ifs); - output.load(ifs); + args_ = std::make_shared(); + dict_ = std::make_shared(args_); + input_ = std::make_shared(); + output_ = std::make_shared(); + args_->load(ifs); + dict_->load(ifs); + input_->load(ifs); + output_->load(ifs); + model_ = std::make_shared(input_, output_, args_, 0); + if (args_->model == model_name::sup) { + model_->setTargetCounts(dict_->getCounts(entry_type::label)); + } else { + model_->setTargetCounts(dict_->getCounts(entry_type::word)); + } ifs.close(); } -void printInfo(Model& model, real progress) { - real loss = info::allLoss / info::allN; - real t = real(clock() - info::start) / CLOCKS_PER_SEC; - real wst = real(info::allWords) / t; - int eta = int(t / progress * (1 - progress) / args.thread); +void FastText::printInfo(real progress, real loss) { + real t = real(clock() - start) / CLOCKS_PER_SEC; + real wst = real(tokenCount) / t; + real lr = args_->lr * (1.0 - progress); + int eta = int(t / progress * (1 - progress) / args_->thread); int etah = eta / 3600; int etam = (eta - etah * 3600) / 60; std::cout << std::fixed; std::cout << "\rProgress: " << std::setprecision(1) << 100 * progress << "%"; std::cout << " words/sec/thread: " << std::setprecision(0) << wst; - std::cout << " lr: " << std::setprecision(6) << model.getLearningRate(); + std::cout << " lr: " << std::setprecision(6) << lr; std::cout << " loss: " << std::setprecision(6) << loss; std::cout << " eta: " << etah << "h" << etam << "m "; std::cout << std::flush; } -void supervised(Model& model, - const std::vector& line, - const std::vector& labels, - double& loss, int32_t& nexamples) { +void FastText::supervised(Model& model, real lr, + const std::vector& line, + const std::vector& labels) { if (labels.size() == 0 || line.size() == 0) return; std::uniform_int_distribution<> uniform(0, labels.size() - 1); int32_t i = uniform(model.rng); - loss += model.update(line, labels[i]); - 
nexamples++; + model.update(line, labels[i], lr); } -void cbow(Dictionary& dict, Model& model, - const std::vector& line, - double& loss, int32_t& nexamples) { +void FastText::cbow(Model& model, real lr, + const std::vector& line) { std::vector bow; - std::uniform_int_distribution<> uniform(1, args.ws); + std::uniform_int_distribution<> uniform(1, args_->ws); for (int32_t w = 0; w < line.size(); w++) { int32_t boundary = uniform(model.rng); bow.clear(); for (int32_t c = -boundary; c <= boundary; c++) { if (c != 0 && w + c >= 0 && w + c < line.size()) { - const std::vector& ngrams = dict.getNgrams(line[w + c]); + const std::vector& ngrams = dict_->getNgrams(line[w + c]); bow.insert(bow.end(), ngrams.cbegin(), ngrams.cend()); } } - loss += model.update(bow, line[w]); - nexamples++; + model.update(bow, line[w], lr); } } -void skipgram(Dictionary& dict, Model& model, - const std::vector& line, - double& loss, int32_t& nexamples) { - std::uniform_int_distribution<> uniform(1, args.ws); +void FastText::skipgram(Model& model, real lr, + const std::vector& line) { + std::uniform_int_distribution<> uniform(1, args_->ws); for (int32_t w = 0; w < line.size(); w++) { int32_t boundary = uniform(model.rng); - const std::vector& ngrams = dict.getNgrams(line[w]); + const std::vector& ngrams = dict_->getNgrams(line[w]); for (int32_t c = -boundary; c <= boundary; c++) { if (c != 0 && w + c >= 0 && w + c < line.size()) { - loss += model.update(ngrams, line[w + c]); - nexamples++; + model.update(ngrams, line[w + c], lr); } } } } -void test(Dictionary& dict, Model& model, std::string filename, int32_t k) { +void FastText::test(const std::string& filename, int32_t k) { int32_t nexamples = 0, nlabels = 0; double precision = 0.0; std::vector line, labels; @@ -171,11 +158,11 @@ void test(Dictionary& dict, Model& model, std::string filename, int32_t k) { exit(EXIT_FAILURE); } while (ifs.peek() != EOF) { - dict.getLine(ifs, line, labels, model.rng); - dict.addNgrams(line, args.wordNgrams); + 
dict_->getLine(ifs, line, labels, model_->rng); + dict_->addNgrams(line, args_->wordNgrams); if (labels.size() > 0 && line.size() > 0) { std::vector> predictions; - model.predict(line, k, predictions); + model_->predict(line, k, predictions); for (auto it = predictions.cbegin(); it != predictions.cend(); it++) { if (std::find(labels.begin(), labels.end(), it->second) != labels.end()) { precision += 1.0; @@ -192,7 +179,7 @@ void test(Dictionary& dict, Model& model, std::string filename, int32_t k) { std::cout << "Number of examples: " << nexamples << std::endl; } -void predict(Dictionary& dict, Model& model, std::string filename, int32_t k) { +void FastText::predict(const std::string& filename, int32_t k, bool print_prob) { std::vector line, labels; std::ifstream ifs(filename); if (!ifs.is_open()) { @@ -200,83 +187,114 @@ void predict(Dictionary& dict, Model& model, std::string filename, int32_t k) { exit(EXIT_FAILURE); } while (ifs.peek() != EOF) { - dict.getLine(ifs, line, labels, model.rng); - dict.addNgrams(line, args.wordNgrams); + dict_->getLine(ifs, line, labels, model_->rng); + dict_->addNgrams(line, args_->wordNgrams); if (line.empty()) { std::cout << "n/a" << std::endl; continue; } std::vector> predictions; - model.predict(line, k, predictions); + model_->predict(line, k, predictions); for (auto it = predictions.cbegin(); it != predictions.cend(); it++) { if (it != predictions.cbegin()) { std::cout << ' '; } - std::cout << dict.getLabel(it->second); + std::cout << dict_->getLabel(it->second); + if (print_prob) { + std::cout << ' ' << exp(it->first); + } } std::cout << std::endl; } ifs.close(); } -void trainThread(Dictionary& dict, Matrix& input, Matrix& output, - int32_t threadId) { - std::ifstream ifs(args.input); - utils::seek(ifs, threadId * utils::size(ifs) / args.thread); +void FastText::trainThread(int32_t threadId) { + std::ifstream ifs(args_->input); + utils::seek(ifs, threadId * utils::size(ifs) / args_->thread); - Model model(input, output, 
args.dim, args.lr, threadId); - if (args.model == model_name::sup) { - model.setTargetCounts(dict.getCounts(entry_type::label)); + Model model(input_, output_, args_, threadId); + if (args_->model == model_name::sup) { + model.setTargetCounts(dict_->getCounts(entry_type::label)); } else { - model.setTargetCounts(dict.getCounts(entry_type::word)); + model.setTargetCounts(dict_->getCounts(entry_type::word)); } - real progress; - const int64_t ntokens = dict.ntokens(); - int64_t tokenCount = 0, printCount = 0, deltaCount = 0; - double loss = 0.0; - int32_t nexamples = 0; + const int64_t ntokens = dict_->ntokens(); + int64_t localTokenCount = 0; std::vector line, labels; - while (info::allWords < args.epoch * ntokens) { - deltaCount = dict.getLine(ifs, line, labels, model.rng); - tokenCount += deltaCount; - printCount += deltaCount; - if (args.model == model_name::sup) { - dict.addNgrams(line, args.wordNgrams); - supervised(model, line, labels, loss, nexamples); - } else if (args.model == model_name::cbow) { - cbow(dict, model, line, loss, nexamples); - } else if (args.model == model_name::sg) { - skipgram(dict, model, line, loss, nexamples); + while (tokenCount < args_->epoch * ntokens) { + real progress = real(tokenCount) / (args_->epoch * ntokens); + real lr = args_->lr * (1.0 - progress); + localTokenCount += dict_->getLine(ifs, line, labels, model.rng); + if (args_->model == model_name::sup) { + dict_->addNgrams(line, args_->wordNgrams); + supervised(model, lr, line, labels); + } else if (args_->model == model_name::cbow) { + cbow(model, lr, line); + } else if (args_->model == model_name::sg) { + skipgram(model, lr, line); } - if (tokenCount > args.lrUpdateRate) { - info::allWords += tokenCount; - info::allLoss += loss; - info::allN += nexamples; - tokenCount = 0; - loss = 0.0; - nexamples = 0; - progress = real(info::allWords) / (args.epoch * ntokens); - model.setLearningRate(args.lr * (1.0 - progress)); + if (localTokenCount > args_->lrUpdateRate) { + tokenCount 
+= localTokenCount; + localTokenCount = 0; if (threadId == 0) { - printInfo(model, progress); + printInfo(progress, model.getLoss()); } } } if (threadId == 0) { - printInfo(model, 1.0); + printInfo(1.0, model.getLoss()); std::cout << std::endl; } ifs.close(); } +void FastText::train(std::shared_ptr args) { + args_ = args; + dict_ = std::make_shared(args_); + std::ifstream ifs(args_->input); + if (!ifs.is_open()) { + std::cerr << "Input file cannot be opened!" << std::endl; + exit(EXIT_FAILURE); + } + dict_->readFromFile(ifs); + ifs.close(); + + input_ = std::make_shared(dict_->nwords()+args_->bucket, args_->dim); + if (args_->model == model_name::sup) { + output_ = std::make_shared(dict_->nlabels(), args_->dim); + } else { + output_ = std::make_shared(dict_->nwords(), args_->dim); + } + input_->uniform(1.0 / args_->dim); + output_->zero(); + + start = clock(); + tokenCount = 0; + std::vector threads; + for (int32_t i = 0; i < args_->thread; i++) { + threads.push_back(std::thread([=]() { trainThread(i); })); + } + for (auto it = threads.begin(); it != threads.end(); ++it) { + it->join(); + } + model_ = std::make_shared(input_, output_, args_, 0); + + saveModel(); + if (args_->model != model_name::sup) { + saveVectors(); + } +} + void printUsage() { std::cout << "usage: fasttext \n\n" << "The commands supported by fasttext are:\n\n" << " supervised train a supervised classifier\n" << " test evaluate a supervised classifier\n" - << " predict predict most likely label\n" + << " predict predict most likely labels\n" + << " predict-prob predict most likely labels with probabilities\n" << " skipgram train a skipgram model\n" << " cbow train a cbow model\n" << " print-vectors print vectors given a trained model\n" @@ -294,7 +312,7 @@ void printTestUsage() { void printPredictUsage() { std::cout - << "usage: fasttext predict []\n\n" + << "usage: fasttext predict[-prob] []\n\n" << " model filename\n" << " test data filename\n" << " (optional; 1 by default) predict top k 
labels\n" @@ -318,12 +336,9 @@ void test(int argc, char** argv) { printTestUsage(); exit(EXIT_FAILURE); } - Dictionary dict; - Matrix input, output; - loadModel(std::string(argv[2]), dict, input, output); - Model model(input, output, args.dim, args.lr, 1); - model.setTargetCounts(dict.getCounts(entry_type::label)); - test(dict, model, std::string(argv[3]), k); + FastText fasttext; + fasttext.loadModel(std::string(argv[2])); + fasttext.test(std::string(argv[3]), k); exit(0); } @@ -337,12 +352,10 @@ void predict(int argc, char** argv) { printPredictUsage(); exit(EXIT_FAILURE); } - Dictionary dict; - Matrix input, output; - loadModel(std::string(argv[2]), dict, input, output); - Model model(input, output, args.dim, args.lr, 1); - model.setTargetCounts(dict.getCounts(entry_type::label)); - predict(dict, model, std::string(argv[3]), k); + bool print_prob = std::string(argv[1]) == "predict-prob"; + FastText fasttext; + fasttext.loadModel(std::string(argv[2])); + fasttext.predict(std::string(argv[3]), k, print_prob); exit(0); } @@ -351,52 +364,17 @@ void printVectors(int argc, char** argv) { printPrintVectorsUsage(); exit(EXIT_FAILURE); } - Dictionary dict; - Matrix input, output; - loadModel(std::string(argv[2]), dict, input, output); - printVectors(dict, input); + FastText fasttext; + fasttext.loadModel(std::string(argv[2])); + fasttext.printVectors(); exit(0); } void train(int argc, char** argv) { - args.parseArgs(argc, argv); - - Dictionary dict; - std::ifstream ifs(args.input); - if (!ifs.is_open()) { - std::cerr << "Input file cannot be opened!" 
<< std::endl; - exit(EXIT_FAILURE); - } - dict.readFromFile(ifs); - ifs.close(); - - Matrix input(dict.nwords() + args.bucket, args.dim); - Matrix output; - if (args.model == model_name::sup) { - output = Matrix(dict.nlabels(), args.dim); - } else { - output = Matrix(dict.nwords(), args.dim); - } - input.uniform(1.0 / args.dim); - output.zero(); - - info::start = clock(); - time_t t0 = time(nullptr); - std::vector threads; - for (int32_t i = 0; i < args.thread; i++) { - threads.push_back(std::thread(&trainThread, std::ref(dict), - std::ref(input), std::ref(output), i)); - } - for (auto it = threads.begin(); it != threads.end(); ++it) { - it->join(); - } - double trainTime = difftime(time(nullptr), t0); - std::cout << "Train time: " << trainTime << " sec" << std::endl; - - saveModel(dict, input, output); - if (args.model != model_name::sup) { - saveVectors(dict, input, output); - } + std::shared_ptr a = std::make_shared(); + a->parseArgs(argc, argv); + FastText fasttext; + fasttext.train(a); } int main(int argc, char** argv) { @@ -412,7 +390,7 @@ int main(int argc, char** argv) { test(argc, argv); } else if (command == "print-vectors") { printVectors(argc, argv); - } else if (command == "predict") { + } else if (command == "predict" || command == "predict-prob" ) { predict(argc, argv); } else { printUsage(); diff --git a/fasttext/cpp/src/fasttext.h b/fasttext/cpp/src/fasttext.h new file mode 100644 index 0000000..777a105 --- /dev/null +++ b/fasttext/cpp/src/fasttext.h @@ -0,0 +1,52 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. An additional grant + * of patent rights can be found in the PATENTS file in the same directory. 
+ */ + +#ifndef FASTTEXT_FASTTEXT_H +#define FASTTEXT_FASTTEXT_H + +#include +#include + +#include "matrix.h" +#include "vector.h" +#include "dictionary.h" +#include "model.h" +#include "utils.h" +#include "real.h" +#include "args.h" + +class FastText { + private: + std::shared_ptr args_; + std::shared_ptr dict_; + std::shared_ptr input_; + std::shared_ptr output_; + std::shared_ptr model_; + std::atomic tokenCount; + clock_t start; + + public: + void getVector(Vector&, const std::string&); + void saveVectors(); + void printVectors(); + void saveModel(); + void loadModel(const std::string&); + void printInfo(real, real); + + void supervised(Model&, real, const std::vector&, + const std::vector&); + void cbow(Model&, real, const std::vector&); + void skipgram(Model&, real, const std::vector&); + void test(const std::string&, int32_t); + void predict(const std::string&, int32_t, bool); + void trainThread(int32_t); + void train(std::shared_ptr); +}; + +#endif diff --git a/fasttext/cpp/src/matrix.cc b/fasttext/cpp/src/matrix.cc index 68d880c..9a60ea2 100644 --- a/fasttext/cpp/src/matrix.cc +++ b/fasttext/cpp/src/matrix.cc @@ -83,16 +83,16 @@ real Matrix::dotRow(const Vector& vec, int64_t i) { return d; } -void Matrix::save(std::ofstream& ofs) { - ofs.write((char*) &m_, sizeof(int64_t)); - ofs.write((char*) &n_, sizeof(int64_t)); - ofs.write((char*) data_, m_ * n_ * sizeof(real)); +void Matrix::save(std::ostream& out) { + out.write((char*) &m_, sizeof(int64_t)); + out.write((char*) &n_, sizeof(int64_t)); + out.write((char*) data_, m_ * n_ * sizeof(real)); } -void Matrix::load(std::ifstream& ifs) { - ifs.read((char*) &m_, sizeof(int64_t)); - ifs.read((char*) &n_, sizeof(int64_t)); +void Matrix::load(std::istream& in) { + in.read((char*) &m_, sizeof(int64_t)); + in.read((char*) &n_, sizeof(int64_t)); delete[] data_; data_ = new real[m_ * n_]; - ifs.read((char*) data_, m_ * n_ * sizeof(real)); + in.read((char*) data_, m_ * n_ * sizeof(real)); } diff --git 
a/fasttext/cpp/src/matrix.h b/fasttext/cpp/src/matrix.h index 691619c..1eb903a 100644 --- a/fasttext/cpp/src/matrix.h +++ b/fasttext/cpp/src/matrix.h @@ -11,7 +11,8 @@ #define FASTTEXT_MATRIX_H #include -#include +#include +#include #include "real.h" @@ -35,8 +36,8 @@ class Matrix { real dotRow(const Vector&, int64_t); void addRow(const Vector&, int64_t, real); - void save(std::ofstream&); - void load(std::ifstream&); + void save(std::ostream&); + void load(std::istream&); }; #endif diff --git a/fasttext/cpp/src/model.cc b/fasttext/cpp/src/model.cc index 98089a8..cc19087 100644 --- a/fasttext/cpp/src/model.cc +++ b/fasttext/cpp/src/model.cc @@ -13,36 +13,30 @@ #include -#include "args.h" #include "utils.h" -extern Args args; - -real Model::lr_ = MIN_LR; - -Model::Model(Matrix& wi, Matrix& wo, int32_t hsz, real lr, int32_t seed) - : wi_(wi), wo_(wo), hidden_(hsz), output_(wo.m_), - grad_(hsz), rng(seed) { - isz_ = wi.m_; - osz_ = wo.m_; - hsz_ = hsz; - lr_ = lr; +Model::Model(std::shared_ptr wi, + std::shared_ptr wo, + std::shared_ptr args, + int32_t seed) + : hidden_(args->dim), output_(wo->m_), grad_(args->dim), rng(seed) +{ + wi_ = wi; + wo_ = wo; + args_ = args; + isz_ = wi->m_; + osz_ = wo->m_; + hsz_ = args->dim; negpos = 0; + loss_ = 0.0; + nexamples_ = 1; } -void Model::setLearningRate(real lr) { - lr_ = (lr < MIN_LR) ? 
MIN_LR : lr; -} - -real Model::getLearningRate() { - return lr_; -} - -real Model::binaryLogistic(int32_t target, bool label) { - real score = utils::sigmoid(wo_.dotRow(hidden_, target)); - real alpha = lr_ * (real(label) - score); - grad_.addRow(wo_, target, alpha); - wo_.addRow(hidden_, target, alpha); +real Model::binaryLogistic(int32_t target, bool label, real lr) { + real score = utils::sigmoid(wo_->dotRow(hidden_, target)); + real alpha = lr * (real(label) - score); + grad_.addRow(*wo_, target, alpha); + wo_->addRow(hidden_, target, alpha); if (label) { return -utils::log(score); } else { @@ -50,33 +44,32 @@ real Model::binaryLogistic(int32_t target, bool label) { } } -real Model::negativeSampling(int32_t target) { +real Model::negativeSampling(int32_t target, real lr) { real loss = 0.0; grad_.zero(); - for (int32_t n = 0; n <= args.neg; n++) { + for (int32_t n = 0; n <= args_->neg; n++) { if (n == 0) { - loss += binaryLogistic(target, true); + loss += binaryLogistic(target, true, lr); } else { - loss += binaryLogistic(getNegative(target), false); + loss += binaryLogistic(getNegative(target), false, lr); } } return loss; } -real Model::hierarchicalSoftmax(int32_t target) { +real Model::hierarchicalSoftmax(int32_t target, real lr) { real loss = 0.0; grad_.zero(); const std::vector& binaryCode = codes[target]; const std::vector& pathToRoot = paths[target]; for (int32_t i = 0; i < pathToRoot.size(); i++) { - loss += binaryLogistic(pathToRoot[i], binaryCode[i]); + loss += binaryLogistic(pathToRoot[i], binaryCode[i], lr); } return loss; } -real Model::softmax(int32_t target) { - grad_.zero(); - output_.mul(wo_, hidden_); +void Model::computeOutputSoftmax() { + output_.mul(*wo_, hidden_); real max = output_[0], z = 0.0; for (int32_t i = 0; i < osz_; i++) { max = std::max(output_[i], max); @@ -86,11 +79,18 @@ real Model::softmax(int32_t target) { z += output_[i]; } for (int32_t i = 0; i < osz_; i++) { - real label = (i == target) ? 
1.0 : 0.0; output_[i] /= z; - real alpha = lr_ * (label - output_[i]); - grad_.addRow(wo_, i, alpha); - wo_.addRow(hidden_, i, alpha); + } +} + +real Model::softmax(int32_t target, real lr) { + grad_.zero(); + computeOutputSoftmax(); + for (int32_t i = 0; i < osz_; i++) { + real label = (i == target) ? 1.0 : 0.0; + real alpha = lr * (label - output_[i]); + grad_.addRow(*wo_, i, alpha); + wo_->addRow(hidden_, i, alpha); } return -utils::log(output_[target]); } @@ -98,7 +98,7 @@ real Model::softmax(int32_t target) { void Model::computeHidden(const std::vector& input) { hidden_.zero(); for (auto it = input.cbegin(); it != input.cend(); ++it) { - hidden_.addRow(wi_, *it); + hidden_.addRow(*wi_, *it); } hidden_.mul(1.0 / input.size()); } @@ -113,21 +113,21 @@ void Model::predict(const std::vector& input, int32_t k, assert(k > 0); heap.reserve(k + 1); computeHidden(input); - if (args.loss == loss_name::hs) { + if (args_->loss == loss_name::hs) { dfs(k, 2 * osz_ - 2, 0.0, heap); } else { - output_.mul(wo_, hidden_); findKBest(k, heap); } std::sort_heap(heap.begin(), heap.end(), comparePairs); } void Model::findKBest(int32_t k, std::vector>& heap) { + computeOutputSoftmax(); for (int32_t i = 0; i < osz_; i++) { - if (heap.size() == k && output_[i] < heap.front().first) { + if (heap.size() == k && utils::log(output_[i]) < heap.front().first) { continue; } - heap.push_back(std::make_pair(output_[i], i)); + heap.push_back(std::make_pair(utils::log(output_[i]), i)); std::push_heap(heap.begin(), heap.end(), comparePairs); if (heap.size() > k) { std::pop_heap(heap.begin(), heap.end(), comparePairs); @@ -152,45 +152,44 @@ void Model::dfs(int32_t k, int32_t node, real score, return; } - real f = utils::sigmoid(wo_.dotRow(hidden_, node - osz_)); + real f = utils::sigmoid(wo_->dotRow(hidden_, node - osz_)); dfs(k, tree[node].left, score + utils::log(1.0 - f), heap); dfs(k, tree[node].right, score + utils::log(f), heap); } -real Model::update(const std::vector& input, int32_t target) 
{ +void Model::update(const std::vector& input, int32_t target, real lr) { assert(target >= 0); assert(target < osz_); - if (input.size() == 0) return 0.0; + if (input.size() == 0) return; hidden_.zero(); for (auto it = input.cbegin(); it != input.cend(); ++it) { - hidden_.addRow(wi_, *it); + hidden_.addRow(*wi_, *it); } hidden_.mul(1.0 / input.size()); - real loss; - if (args.loss == loss_name::ns) { - loss = negativeSampling(target); - } else if (args.loss == loss_name::hs) { - loss = hierarchicalSoftmax(target); + if (args_->loss == loss_name::ns) { + loss_ += negativeSampling(target, lr); + } else if (args_->loss == loss_name::hs) { + loss_ += hierarchicalSoftmax(target, lr); } else { - loss = softmax(target); + loss_ += softmax(target, lr); } + nexamples_ += 1; - if (args.model == model_name::sup) { + if (args_->model == model_name::sup) { grad_.mul(1.0 / input.size()); } for (auto it = input.cbegin(); it != input.cend(); ++it) { - wi_.addRow(grad_, *it, 1.0); + wi_->addRow(grad_, *it, 1.0); } - return loss; } void Model::setTargetCounts(const std::vector& counts) { assert(counts.size() == osz_); - if (args.loss == loss_name::ns) { + if (args_->loss == loss_name::ns) { initTableNegatives(counts); } - if (args.loss == loss_name::hs) { + if (args_->loss == loss_name::hs) { buildTree(counts); } } @@ -261,3 +260,7 @@ void Model::buildTree(const std::vector& counts) { codes.push_back(code); } } + +real Model::getLoss() { + return loss_ / nexamples_; +} diff --git a/fasttext/cpp/src/model.h b/fasttext/cpp/src/model.h index 9a6a48a..35936de 100644 --- a/fasttext/cpp/src/model.h +++ b/fasttext/cpp/src/model.h @@ -13,7 +13,9 @@ #include #include #include +#include +#include "args.h" #include "matrix.h" #include "vector.h" #include "real.h" @@ -28,16 +30,17 @@ struct Node { class Model { private: - Matrix& wi_; - Matrix& wo_; + std::shared_ptr wi_; + std::shared_ptr wo_; + std::shared_ptr args_; Vector hidden_; Vector output_; Vector grad_; int32_t hsz_; int32_t isz_; 
int32_t osz_; - - static real lr_; + real loss_; + int64_t nexamples_; static bool comparePairs(const std::pair&, const std::pair&); @@ -49,30 +52,29 @@ class Model { std::vector tree; static const int32_t NEGATIVE_TABLE_SIZE = 10000000; - static constexpr real MIN_LR = 0.000001; public: - Model(Matrix&, Matrix&, int32_t, real, int32_t); - - void setLearningRate(real); - real getLearningRate(); + Model(std::shared_ptr, std::shared_ptr, + std::shared_ptr, int32_t); - real binaryLogistic(int32_t, bool); - real negativeSampling(int32_t); - real hierarchicalSoftmax(int32_t); - real softmax(int32_t); + real binaryLogistic(int32_t, bool, real); + real negativeSampling(int32_t, real); + real hierarchicalSoftmax(int32_t, real); + real softmax(int32_t, real); void predict(const std::vector&, int32_t, std::vector>&); void dfs(int32_t, int32_t, real, std::vector>&); void findKBest(int32_t, std::vector>&); - real update(const std::vector&, int32_t); + void update(const std::vector&, int32_t, real); void computeHidden(const std::vector&); + void computeOutputSoftmax(); void setTargetCounts(const std::vector&); void initTableNegatives(const std::vector&); int32_t getNegative(int32_t target); void buildTree(const std::vector&); + real getLoss(); std::minstd_rand rng; }; diff --git a/fasttext/cpp/src/utils.cc b/fasttext/cpp/src/utils.cc index 98b7554..6c7e9ac 100644 --- a/fasttext/cpp/src/utils.cc +++ b/fasttext/cpp/src/utils.cc @@ -13,8 +13,8 @@ #include namespace utils { - real* t_sigmoid; - real* t_log; + real* t_sigmoid = nullptr; + real* t_log = nullptr; real log(real x) { if (x > 1.0) { @@ -41,6 +41,7 @@ namespace utils { } void initSigmoid() { + if (t_sigmoid != nullptr) return; t_sigmoid = new real[SIGMOID_TABLE_SIZE + 1]; for (int i = 0; i < SIGMOID_TABLE_SIZE + 1; i++) { real x = real(i * 2 * MAX_SIGMOID) / SIGMOID_TABLE_SIZE - MAX_SIGMOID; @@ -49,6 +50,7 @@ namespace utils { } void initLog() { + if (t_log != nullptr) return; t_log = new real[LOG_TABLE_SIZE + 1]; for 
(int i = 0; i < LOG_TABLE_SIZE + 1; i++) { real x = (real(i) + 1e-5) / LOG_TABLE_SIZE; @@ -69,7 +71,6 @@ namespace utils { } void seek(std::ifstream& ifs, int64_t pos) { - char c; ifs.clear(); ifs.seekg(std::streampos(pos)); } diff --git a/fasttext/cpp/src/vector.cc b/fasttext/cpp/src/vector.cc index 2072bb3..5163879 100644 --- a/fasttext/cpp/src/vector.cc +++ b/fasttext/cpp/src/vector.cc @@ -11,7 +11,6 @@ #include -#include #include #include "matrix.h" diff --git a/fasttext/cpp/src/vector.h b/fasttext/cpp/src/vector.h index a40aaa4..14aef33 100644 --- a/fasttext/cpp/src/vector.h +++ b/fasttext/cpp/src/vector.h @@ -11,7 +11,7 @@ #define FASTTEXT_VECTOR_H #include -#include +#include #include "real.h" diff --git a/update-fasttext.sh b/update-fasttext.sh index b502385..7f600c5 100644 --- a/update-fasttext.sh +++ b/update-fasttext.sh @@ -1,4 +1,4 @@ -NEW_VERSION=3223526 +NEW_VERSION=fabb04e CURRENT_VERSION=$(cat fasttext/cpp/LAST_COMMIT) if [ "$NEW_VERSION" = "$CURRENT_VERSION" ]; then From 3854cff5e0f1d1dc27208c04a15339f46764348b Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Thu, 1 Sep 2016 00:35:28 +0700 Subject: [PATCH 082/109] Add compatibility for fastText version fabb04e --- fasttext/fasttext.pyx | 35 +++++-- fasttext/interface.cc | 208 ++++++++++++++++++++++++----------------- fasttext/interface.h | 24 +++-- fasttext/interface.pxd | 31 +++++- fasttext/model.py | 54 +++++------ 5 files changed, 219 insertions(+), 133 deletions(-) diff --git a/fasttext/fasttext.pyx b/fasttext/fasttext.pyx index f6bbdce..af27e80 100644 --- a/fasttext/fasttext.pyx +++ b/fasttext/fasttext.pyx @@ -24,8 +24,24 @@ cdef class FastTextModelWrapper: def __cinit__(self): self.fm = FastTextModel() - def get_words(self): - return self.words + # dict_* methods is a wrapper for the Dictionary class methods; + # We can't access dicrectly Dictionary in python because + # Dictionary class doesn't have a nullary constructor + def dict_nwords(self): + return self.fm.dictGetNWords() + + def 
dict_get_word(self, i): + cdef string cpp_string + cpp_string = self.fm.dictGetWord(i) + return cpp_string.decode('utf-8') + + def dict_nlabels(self): + return self.fm.dictGetNLabels() + + def dict_get_label(self, i): + cdef string cpp_string + cpp_string = self.fm.dictGetLabel(i) + return cpp_string.decode('utf-8') def get_vector(self, word): word_bytes = bytes(word, 'utf-8') @@ -117,25 +133,24 @@ def load_model(filename, label_prefix=''): model = FastTextModelWrapper() filename_bytes = bytes(filename, 'utf-8') try: + # How we load the dictionary loadModelWrapper(filename_bytes, model.fm) except: raise Exception('fastText: Cannot load ' + filename + ' due to C++ extension failed to allocate the memory') model_name = model.fm.modelName - dictionary = model.fm.getDictionary() - cdef string cpp_string if model_name == 'skipgram' or model_name == 'cbow': words = [] - for i in xrange(dictionary.nwords()): - cpp_string = dictionary.getWord(i) - words.append(cpp_string.decode('utf-8')) + # We build the dictionary here to support unicode characters + for i in xrange(model.get_nwords()): + word = model.get_word(i) + words.append(word) return WordVectorModel(model, words) elif model_name == 'supervised': labels = [] - for i in xrange(dictionary.nlabels()): - cpp_string = dictionary.getLabel(i) - label = cpp_string.decode('utf-8') + for i in xrange(model.get_nlabels()): + label = model.get_label(i) # Remove the prefix labels.append(label.replace(label_prefix, '')) return SupervisedModel(model, labels, label_prefix) diff --git a/fasttext/interface.cc b/fasttext/interface.cc index b3d303e..68c4f93 100644 --- a/fasttext/interface.cc +++ b/fasttext/interface.cc @@ -1,8 +1,11 @@ /* An interface for fastText */ #include +#include #include #include #include +#include +#include #include "interface.h" #include "cpp/src/real.h" @@ -11,8 +14,7 @@ #include "cpp/src/matrix.h" #include "cpp/src/vector.h" #include "cpp/src/model.h" - -#include "cpp/src/fasttext.cc" +#include 
"cpp/src/fasttext.h" FastTextModel::FastTextModel(){} @@ -21,65 +23,98 @@ std::vector FastTextModel::getWords() return _words; } -Dictionary FastTextModel::getDictionary() -{ - return _dict; -} - void FastTextModel::addWord(std::string word) { _words.push_back(word); } -void FastTextModel::setDict(Dictionary dict) -{ - _dict = dict; -} - -void FastTextModel::setMatrix(Matrix& input, Matrix& output) -{ - _input_matrix = input; - _output_matrix = output; -} - -void FastTextModel::setArg(Args arg) +void FastTextModel::setArgs(std::shared_ptr args) { - dim = arg.dim; - ws = arg.ws; - epoch = arg.epoch; - minCount = arg.minCount; - neg = arg.neg; - wordNgrams = arg.wordNgrams; - if(arg.loss == loss_name::ns) { + dim = args->dim; + ws = args->ws; + epoch = args->epoch; + minCount = args->minCount; + neg = args->neg; + wordNgrams = args->wordNgrams; + if(args->loss == loss_name::ns) { lossName = "ns"; } - if(arg.loss == loss_name::hs) { + if(args->loss == loss_name::hs) { lossName = "hs"; } - if(arg.loss == loss_name::softmax) { + if(args->loss == loss_name::softmax) { lossName = "softmax"; } - if(arg.model == model_name::cbow) { + if(args->model == model_name::cbow) { modelName = "cbow"; } - if(arg.model == model_name::sg) { + if(args->model == model_name::sg) { modelName = "skipgram"; } - if(arg.model == model_name::sup) { + if(args->model == model_name::sup) { modelName = "supervised"; } - bucket = arg.bucket; - minn = arg.minn; - maxn = arg.maxn; - lrUpdateRate = arg.lrUpdateRate; - t = arg.t; - lr = arg.lr; + bucket = args->bucket; + minn = args->minn; + maxn = args->maxn; + lrUpdateRate = args->lrUpdateRate; + t = args->t; + lr = args->lr; +} + +void FastTextModel::setDictionary(std::shared_ptr dict) +{ + _dict = dict; +} + +void FastTextModel::setMatrix(std::shared_ptr input, + std::shared_ptr output) +{ + _input_matrix = input; + _output_matrix = output; +} + +void FastTextModel::setModel(std::shared_ptr model) +{ + _model = model; } +/* Methods to wrap the 
Dictionary methods; since we can't access + * dicrectly Dictionary in python because Dictionary doesn't have + * nullary constructor */ +int32_t FastTextModel::dictGetNWords() +{ + return _dict->nwords(); +} + +std::string FastTextModel::dictGetWord(int32_t i) +{ + return _dict->getWord(i); +} + +int32_t FastTextModel::dictGetNLabels() +{ + return _dict->nlabels(); +} + +std::string FastTextModel::dictGetLabel(int32_t i) +{ + return _dict->getLabel(i); +} + +/* We use the same logic as FastText::getVector here; Because + * we need to access our own dictionary and input matrix */ std::vector FastTextModel::getVectorWrapper(std::string word) { Vector vec(dim); - getVector(_dict, _input_matrix, vec, word); + const std::vector& ngrams = _dict->getNgrams(word); + vec.zero(); + for (auto it = ngrams.begin(); it != ngrams.end(); ++it) { + vec.addRow(*_input_matrix, *it); + } + if (ngrams.size() > 0) { + vec.mul(1.0 / ngrams.size()); + } std::vector vector(vec.data_, vec.data_ + vec.m_); return vector; } @@ -87,15 +122,6 @@ std::vector FastTextModel::getVectorWrapper(std::string word) std::vector FastTextModel::classifierTest(std::string filename, int32_t k) { - /* Initialize the model - * We use default value of learning rate here, since the fasttext(1) test - * command also use the default value. 
- * https://github.com/facebookresearch/fastText/blob/9bfa32d/src/fasttext.cc#L307 - * (generated model.bin file doesn't contain the learning rate info, args.lr - * will have the default value when model.bin loaded) */ - Model model(_input_matrix, _output_matrix, dim, args.lr, 1); - model.setTargetCounts(_dict.getCounts(entry_type::label)); - int32_t nexamples = 0; int32_t nlabels = 0; double precision = 0.0; @@ -108,11 +134,11 @@ std::vector FastTextModel::classifierTest(std::string filename, } while (ifs.peek() != EOF) { - _dict.getLine(ifs, line, labels, model.rng); - _dict.addNgrams(line, wordNgrams); + _dict->getLine(ifs, line, labels, _model->rng); + _dict->addNgrams(line, wordNgrams); if(labels.size() > 0 && line.size() > 0) { std::vector> predictions; - model.predict(line, k, predictions); + _model->predict(line, k, predictions); for(auto it = predictions.cbegin(); it != predictions.cend(); it++) { int32_t i = it->second; @@ -138,17 +164,6 @@ std::vector FastTextModel::classifierTest(std::string filename, std::vector FastTextModel::classifierPredict(std::string text, int32_t k) { - /* Initialize the model - * We use default value of learning rate here, since the fasttext(1) test - * command also use the default value. 
- * https://github.com/facebookresearch/fastText/blob/9bfa32d/src/fasttext.cc#L307 - * (generated model.bin file doesn't contain the learning rate info, args.lr - * will have the default value when model.bin loaded) */ - Model model(_input_matrix, _output_matrix, dim, args.lr, 1); - model.setTargetCounts(_dict.getCounts(entry_type::label)); - std::minstd_rand rng = model.rng; - std::uniform_real_distribution<> uniform(0, 1); - /* Hardcoded here; since we need this variable but the variable * is private in dictionary.h */ const int32_t max_line_size = 1024; @@ -159,24 +174,26 @@ std::vector FastTextModel::classifierPredict(std::string text, std::string token; /* We implement the same logic as Dictionary::getLine */ + std::uniform_real_distribution<> uniform(0, 1); while(iss >> token) { - int32_t word_id = _dict.getId(token); + int32_t word_id = _dict->getId(token); if(word_id < 0) continue; - entry_type type = _dict.getType(word_id); - if (type == entry_type::word && !_dict.discard(word_id, uniform(rng))) { + entry_type type = _dict->getType(word_id); + if (type == entry_type::word && + !_dict->discard(word_id, uniform(_model->rng))) { text_word_ids.push_back(word_id); } if(text_word_ids.size() > max_line_size) break; } - _dict.addNgrams(text_word_ids, wordNgrams); + _dict->addNgrams(text_word_ids, wordNgrams); std::vector labels; if(text_word_ids.size() > 0) { std::vector> predictions; - model.predict(text_word_ids, k, predictions); + _model->predict(text_word_ids, k, predictions); for(auto it = predictions.cbegin(); it != predictions.cend(); it++) { - labels.push_back(_dict.getLabel(it->second)); + labels.push_back(_dict->getLabel(it->second)); } return labels; @@ -196,33 +213,52 @@ void trainWrapper(int argc, char **argv, int silent) /* if silent > 0, the log from train() function will be supressed */ if(silent > 0) { std::cout.rdbuf(new_ofs.rdbuf()); - train(argc, argv); + std::shared_ptr a = std::make_shared(); + a->parseArgs(argc, argv); + FastText fasttext; + 
fasttext.train(a); std::cout.rdbuf(old_ofs); } else { - train(argc, argv); + std::shared_ptr a = std::make_shared(); + a->parseArgs(argc, argv); + FastText fasttext; + fasttext.train(a); } new_ofs.close(); } +/* The logic is the same as FastText::loadModel, we roll our own + * to be able to access data from args, dictionary etc since this + * data is private in FastText class */ void loadModelWrapper(std::string filename, FastTextModel& model) { - Dictionary dict; - Matrix input, output; - loadModel(filename, dict, input, output); - - /* args is defined globally in cpp/src/fasttext.cc - * We parse it to the model, so we not depend on it anymore */ - model.setArg(args); - model.setDict(dict); - model.setMatrix(input, output); - - /* Do the indexing on Cython to support unicode instead of plain - * bytes */ - /* - for(int32_t i = 0; i < dict.nwords(); i++) { - std::string word = dict.getWord(i); - model.addWord(word); + std::ifstream ifs(filename); + if (!ifs.is_open()) { + std::cerr << "interface.cc: cannot load model file "; + std::cerr << filename << std::endl; + exit(EXIT_FAILURE); + } + std::shared_ptr args = std::make_shared(); + std::shared_ptr dict = std::make_shared(args); + std::shared_ptr input_matrix = std::make_shared(); + std::shared_ptr output_matrix = std::make_shared(); + args->load(ifs); + dict->load(ifs); + input_matrix->load(ifs); + output_matrix->load(ifs); + std::shared_ptr model_p = std::make_shared(input_matrix, + output_matrix, args, 0); + if (args->model == model_name::sup) { + model_p->setTargetCounts(dict->getCounts(entry_type::label)); + } else { + model_p->setTargetCounts(dict->getCounts(entry_type::word)); } - */ + ifs.close(); + + /* save all data to FastTextModel */ + model.setArgs(args); + model.setDictionary(dict); + model.setMatrix(input_matrix, output_matrix); + model.setModel(model_p); } diff --git a/fasttext/interface.h b/fasttext/interface.h index 11fbe74..aea2eb0 100644 --- a/fasttext/interface.h +++ b/fasttext/interface.h @@ 
-8,13 +8,15 @@ #include "cpp/src/args.h" #include "cpp/src/dictionary.h" #include "cpp/src/matrix.h" +#include "cpp/src/model.h" class FastTextModel { private: std::vector _words; - Dictionary _dict; - Matrix _input_matrix; - Matrix _output_matrix; + std::shared_ptr _dict; + std::shared_ptr _input_matrix; + std::shared_ptr _output_matrix; + std::shared_ptr _model; public: FastTextModel(); @@ -39,11 +41,17 @@ class FastTextModel { std::vector classifierPredict(std::string text, int32_t k); void addWord(std::string word); - void setDict(Dictionary dict); - void setMatrix(Matrix& input, Matrix& output); - void setArg(Args arg); - - Dictionary getDictionary(); + void setArgs(std::shared_ptr args); + void setDictionary(std::shared_ptr dict); + void setMatrix(std::shared_ptr input, + std::shared_ptr output); + void setModel(std::shared_ptr model); + + /* wrapper for Dictionary class */ + int32_t dictGetNWords(); + std::string dictGetWord(int32_t i); + int32_t dictGetNLabels(); + std::string dictGetLabel(int32_t i); }; void trainWrapper(int argc, char **argv, int silent); diff --git a/fasttext/interface.pxd b/fasttext/interface.pxd index 212a3f6..0d1d543 100644 --- a/fasttext/interface.pxd +++ b/fasttext/interface.pxd @@ -2,13 +2,18 @@ from libcpp.string cimport string from libcpp.vector cimport vector from libc.stdint cimport int32_t +from libcpp.memory cimport shared_ptr cdef extern from "cpp/src/real.h": ctypedef float real +cdef extern from "cpp/src/args.h": + cdef cppclass Args: + Args() + cdef extern from "cpp/src/dictionary.h": cdef cppclass Dictionary: - Dictionary() + Dictionary(shared_ptr[Args]) int32_t nwords() int32_t nlabels() @@ -16,6 +21,24 @@ cdef extern from "cpp/src/dictionary.h": string getWord(int32_t) string getLabel(int32_t) +# Because Dictionary doesn't have nulary constructor +# We create a class wrapper with __cinit__ and __dealloc__ methods which are +# guaranteed to be called exactly once upon creation and deletion of the Python +# instance. 
+# http://cython.readthedocs.io/en/latest/src/userguide/wrapping_CPlusPlus.html#create-cython-wrapper-class +# cdef class DictionaryWrapper: +# cdef Dictionary* dictionary_ptr +# +# def __cinit__(self, shared_ptr[Args] args): +# dictionary_ptr = new Dictionary(args) +# +# def __dealloc__(self): +# del self.dictionary_ptr +# +# def get_word(self, i): +# return self.dictioanry_ptr.getWord(i) + + cdef extern from "interface.h": cdef cppclass FastTextModel: FastTextModel() @@ -38,7 +61,11 @@ cdef extern from "interface.h": vector[double] classifierTest(string filename, int32_t k) vector[string] classifierPredict(string text, int32_t k) - Dictionary getDictionary() + # Wrapper for Dictionary class + int32_t dictGetNWords() + string dictGetWord(int32_t i) + int32_t dictGetNLabels() + string dictGetLabel(int32_t i) void trainWrapper(int argc, char **argvm, int silent) diff --git a/fasttext/model.py b/fasttext/model.py index ba8e8c5..6915b53 100644 --- a/fasttext/model.py +++ b/fasttext/model.py @@ -7,19 +7,19 @@ class WordVectorModel(object): def __init__(self, model, words): self._model = model self.words = words - self.dim = model.dim; - self.ws = model.ws; - self.epoch = model.epoch; - self.min_count = model.minCount; - self.neg = model.neg; - self.word_ngrams = model.wordNgrams; - self.loss_name = model.lossName.decode('utf-8'); - self.model_name = model.modelName.decode('utf-8'); - self.bucket = model.bucket; - self.minn = model.minn; - self.maxn = model.maxn; - self.lr_update_rate = model.lrUpdateRate; - self.t = model.t; + self.dim = model.dim + self.ws = model.ws + self.epoch = model.epoch + self.min_count = model.minCount + self.neg = model.neg + self.word_ngrams = model.wordNgrams + self.loss_name = model.lossName.decode('utf-8') + self.model_name = model.modelName.decode('utf-8') + self.bucket = model.bucket + self.minn = model.minn + self.maxn = model.maxn + self.lr_update_rate = model.lrUpdateRate + self.t = model.t def __getitem__(self, word): return 
self._model.get_vector(word) @@ -39,20 +39,20 @@ class SupervisedModel(object): def __init__(self, model, labels, label_prefix): self._model = model self.labels = labels - self.dim = model.dim; - self.ws = model.ws; - self.epoch = model.epoch; - self.min_count = model.minCount; - self.neg = model.neg; - self.word_ngrams = model.wordNgrams; - self.loss_name = model.lossName.decode('utf-8'); - self.model_name = model.modelName.decode('utf-8'); - self.bucket = model.bucket; - self.minn = model.minn; - self.maxn = model.maxn; - self.lr_update_rate = model.lrUpdateRate; - self.t = model.t; - self.label_prefix = label_prefix; + self.dim = model.dim + self.ws = model.ws + self.epoch = model.epoch + self.min_count = model.minCount + self.neg = model.neg + self.word_ngrams = model.wordNgrams + self.loss_name = model.lossName.decode('utf-8') + self.model_name = model.modelName.decode('utf-8') + self.bucket = model.bucket + self.minn = model.minn + self.maxn = model.maxn + self.lr_update_rate = model.lrUpdateRate + self.t = model.t + self.label_prefix = label_prefix def test(self, test_file, k=1): return self._model.classifier_test(test_file, k) From caeb635a6eaf9b6d21c6dd8e3e7fe93f2cff7469 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Thu, 1 Sep 2016 00:40:46 +0700 Subject: [PATCH 083/109] Fix undefined symbol _ZN8FastText5trainESt10shared_ptrI4Args --- fasttext/interface.h | 1 + 1 file changed, 1 insertion(+) diff --git a/fasttext/interface.h b/fasttext/interface.h index aea2eb0..0ab2537 100644 --- a/fasttext/interface.h +++ b/fasttext/interface.h @@ -3,6 +3,7 @@ #include #include +#include #include "cpp/src/real.h" #include "cpp/src/args.h" From 4f98471c74b93464d2fc4484ccbfa1b7d940fcc6 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Thu, 1 Sep 2016 00:57:30 +0700 Subject: [PATCH 084/109] Fix compatibility for skipgram and CBOW model against the fasttext version fabb04e --- fasttext/fasttext.pyx | 8 ++++---- setup.py | 1 + 2 files changed, 5 insertions(+), 4 
deletions(-) diff --git a/fasttext/fasttext.pyx b/fasttext/fasttext.pyx index af27e80..b1796af 100644 --- a/fasttext/fasttext.pyx +++ b/fasttext/fasttext.pyx @@ -143,14 +143,14 @@ def load_model(filename, label_prefix=''): if model_name == 'skipgram' or model_name == 'cbow': words = [] # We build the dictionary here to support unicode characters - for i in xrange(model.get_nwords()): - word = model.get_word(i) + for i in xrange(model.dict_nwords()): + word = model.dict_get_word(i) words.append(word) return WordVectorModel(model, words) elif model_name == 'supervised': labels = [] - for i in xrange(model.get_nlabels()): - label = model.get_label(i) + for i in xrange(model.dict_nlabels()): + label = model.dict_get_label(i) # Remove the prefix labels.append(label.replace(label_prefix, '')) return SupervisedModel(model, labels, label_prefix) diff --git a/setup.py b/setup.py index 237d0b7..557ce6c 100644 --- a/setup.py +++ b/setup.py @@ -26,6 +26,7 @@ def read_version(): 'fasttext/cpp/src/matrix.cc', 'fasttext/cpp/src/model.cc', 'fasttext/cpp/src/utils.cc', + 'fasttext/cpp/src/fasttext.cc', 'fasttext/cpp/src/vector.cc' ], language='c++', From 3b7cfce897111e01139292e1808ca224d5b82651 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Thu, 1 Sep 2016 01:06:31 +0700 Subject: [PATCH 085/109] Remove unused code in fasttext/interface.pxd --- fasttext/interface.pxd | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/fasttext/interface.pxd b/fasttext/interface.pxd index 0d1d543..1eddc64 100644 --- a/fasttext/interface.pxd +++ b/fasttext/interface.pxd @@ -21,24 +21,6 @@ cdef extern from "cpp/src/dictionary.h": string getWord(int32_t) string getLabel(int32_t) -# Because Dictionary doesn't have nulary constructor -# We create a class wrapper with __cinit__ and __dealloc__ methods which are -# guaranteed to be called exactly once upon creation and deletion of the Python -# instance. 
-# http://cython.readthedocs.io/en/latest/src/userguide/wrapping_CPlusPlus.html#create-cython-wrapper-class -# cdef class DictionaryWrapper: -# cdef Dictionary* dictionary_ptr -# -# def __cinit__(self, shared_ptr[Args] args): -# dictionary_ptr = new Dictionary(args) -# -# def __dealloc__(self): -# del self.dictionary_ptr -# -# def get_word(self, i): -# return self.dictioanry_ptr.getWord(i) - - cdef extern from "interface.h": cdef cppclass FastTextModel: FastTextModel() From 71b33795df456d1e186fe5671607173e97674f4b Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Thu, 1 Sep 2016 01:07:18 +0700 Subject: [PATCH 086/109] Fix compatibility for classifier model against the fasttext version fabb04e --- fasttext/fasttext.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fasttext/fasttext.pyx b/fasttext/fasttext.pyx index b1796af..154ae5e 100644 --- a/fasttext/fasttext.pyx +++ b/fasttext/fasttext.pyx @@ -3,7 +3,6 @@ cimport utils from interface cimport trainWrapper from interface cimport loadModelWrapper from interface cimport FastTextModel -from interface cimport Dictionary # Python/C++ standart libraries from libc.stdlib cimport malloc, free @@ -125,6 +124,8 @@ cdef class FastTextModelWrapper: # label_prefix is an optional argument to load the supervised model # prefix will be removed from the label name and stored in the model.labels def load_model(filename, label_prefix=''): + # Initialize log & sigmoid tables + utils.initTables() # Check if the filename is readable if not os.path.isfile(filename): @@ -238,7 +239,7 @@ def cbow(input_file, output, lr=0.05, dim=100, ws=5, epoch=5, min_count=5, # Train classifier def supervised(input_file, output, label_prefix='__label__', lr=0.05, dim=100, ws=5, epoch=5, min_count=1, neg=5, word_ngrams=1, loss='softmax', - bucket=2000000, minn=3, maxn=6, thread=12, lr_update_rate=100, + bucket=0, minn=3, maxn=6, thread=12, lr_update_rate=100, t=1e-4, silent=1): return train_wrapper('supervised', input_file, 
output, label_prefix, lr, dim, ws, epoch, min_count, neg, word_ngrams, loss, bucket, minn, From dd4e1513040ed69f7ab92e61b127002d60dd1b0b Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Thu, 1 Sep 2016 01:08:00 +0700 Subject: [PATCH 087/109] Update classifier default params value --- README.md | 8 ++++---- README.rst | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 987249c..82a0888 100644 --- a/README.md +++ b/README.md @@ -285,7 +285,7 @@ List of available `params` and their default value: input training file path (required) output output file path (required) label_prefix label prefix ['__label__'] -lr learning rate [0.05] +lr learning rate [0.1] lr_update_rate change the rate of updates for the learning rate [100] dim size of word vectors [100] ws size of the context window [5] @@ -294,9 +294,9 @@ min_count minimal number of word occurences [1] neg number of negatives sampled [5] word_ngrams max length of word ngram [1] loss loss function {ns, hs, softmax} [softmax] -bucket number of buckets [2000000] -minn min length of char ngram [3] -maxn max length of char ngram [6] +bucket number of buckets [0] +minn min length of char ngram [0] +maxn max length of char ngram [0] thread number of threads [12] t sampling threshold [0.0001] silent disable the log output from the C++ extension [1] diff --git a/README.rst b/README.rst index 478e89a..b92755d 100644 --- a/README.rst +++ b/README.rst @@ -300,7 +300,7 @@ List of available ``params`` and their default value: input training file path (required) output output file path (required) label_prefix label prefix ['__label__'] - lr learning rate [0.05] + lr learning rate [0.1] lr_update_rate change the rate of updates for the learning rate [100] dim size of word vectors [100] ws size of the context window [5] @@ -309,9 +309,9 @@ List of available ``params`` and their default value: neg number of negatives sampled [5] word_ngrams max length of word ngram [1] loss loss 
function {ns, hs, softmax} [softmax] - bucket number of buckets [2000000] - minn min length of char ngram [3] - maxn max length of char ngram [6] + bucket number of buckets [0] + minn min length of char ngram [0] + maxn max length of char ngram [0] thread number of threads [12] t sampling threshold [0.0001] silent disable the log output from the C++ extension [1] From 836fa0152c1c35cbafb2a47a85140dc243ef24c2 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Thu, 1 Sep 2016 12:41:35 +0700 Subject: [PATCH 088/109] Update default value of minn & maxn in supervised to prevent Floating point exception --- fasttext/fasttext.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fasttext/fasttext.pyx b/fasttext/fasttext.pyx index 154ae5e..cbc30ed 100644 --- a/fasttext/fasttext.pyx +++ b/fasttext/fasttext.pyx @@ -237,9 +237,9 @@ def cbow(input_file, output, lr=0.05, dim=100, ws=5, epoch=5, min_count=5, thread, lr_update_rate, t, silent) # Train classifier -def supervised(input_file, output, label_prefix='__label__', lr=0.05, dim=100, +def supervised(input_file, output, label_prefix='__label__', lr=0.1, dim=100, ws=5, epoch=5, min_count=1, neg=5, word_ngrams=1, loss='softmax', - bucket=0, minn=3, maxn=6, thread=12, lr_update_rate=100, + bucket=0, minn=0, maxn=0, thread=12, lr_update_rate=100, t=1e-4, silent=1): return train_wrapper('supervised', input_file, output, label_prefix, lr, dim, ws, epoch, min_count, neg, word_ngrams, loss, bucket, minn, From 9e2e8287255daadae12091bc1af3e9f830fbc1e4 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Thu, 1 Sep 2016 21:57:03 +0700 Subject: [PATCH 089/109] Implement classifier.predict_proba --- fasttext/fasttext.pyx | 17 ++++++++++++++++ fasttext/interface.cc | 46 ++++++++++++++++++++++++++++++++++++++++++ fasttext/interface.h | 2 ++ fasttext/interface.pxd | 1 + fasttext/model.py | 8 ++++++++ 5 files changed, 74 insertions(+) diff --git a/fasttext/fasttext.pyx b/fasttext/fasttext.pyx index cbc30ed..b0322a3 
100644 --- a/fasttext/fasttext.pyx +++ b/fasttext/fasttext.pyx @@ -64,6 +64,23 @@ cdef class FastTextModelWrapper: labels.append(label) return labels + def classifier_predict_prob(self, text, k, label_prefix): + cdef vector[vector[string]] raw_results + cdef string cpp_str + text_bytes = bytes(text, 'utf-8') + labels = [] + probabilities = [] + raw_results = self.fm.classifierPredictProb(text_bytes, k) + for result in raw_results: + cpp_str = result[0] + label = cpp_str.decode('utf-8') + label = label.replace(label_prefix, '') + cpp_str = result[1] + prob = float(cpp_str) + labels.append(label) + probabilities.append(prob) + return zip(labels, probabilities) + @property def dim(self): return self.fm.dim diff --git a/fasttext/interface.cc b/fasttext/interface.cc index 68c4f93..463e38f 100644 --- a/fasttext/interface.cc +++ b/fasttext/interface.cc @@ -200,7 +200,53 @@ std::vector FastTextModel::classifierPredict(std::string text, } else { return labels; } +} + +std::vector> + FastTextModel::classifierPredictProb(std::string text, int32_t k) +{ + /* Hardcoded here; since we need this variable but the variable + * is private in dictionary.h */ + const int32_t max_line_size = 1024; + + /* List of word ids */ + std::vector text_word_ids; + std::istringstream iss(text); + std::string token; + + /* We implement the same logic as Dictionary::getLine */ + std::uniform_real_distribution<> uniform(0, 1); + while(iss >> token) { + int32_t word_id = _dict->getId(token); + if(word_id < 0) continue; + entry_type type = _dict->getType(word_id); + if (type == entry_type::word && + !_dict->discard(word_id, uniform(_model->rng))) { + text_word_ids.push_back(word_id); + } + if(text_word_ids.size() > max_line_size) break; + } + _dict->addNgrams(text_word_ids, wordNgrams); + std::vector> results; + if(text_word_ids.size() > 0) { + std::vector> predictions; + + _model->predict(text_word_ids, k, predictions); + for(auto it = predictions.cbegin(); it != predictions.cend(); it++) { + 
std::vector result; + result.push_back(_dict->getLabel(it->second)); + + /* We use string stream here instead of to_string, to make sure + * that the string is consistent with std::cout from fasttext(1) */ + std::ostringstream probability_stream; + probability_stream << exp(it->first); + result.push_back(probability_stream.str()); + + results.push_back(result); + } + } + return results; } void trainWrapper(int argc, char **argv, int silent) diff --git a/fasttext/interface.h b/fasttext/interface.h index 0ab2537..4de7bbe 100644 --- a/fasttext/interface.h +++ b/fasttext/interface.h @@ -40,6 +40,8 @@ class FastTextModel { std::vector getVectorWrapper(std::string word); std::vector classifierTest(std::string filename, int32_t k); std::vector classifierPredict(std::string text, int32_t k); + std::vector> classifierPredictProb(std::string text, + int32_t k); void addWord(std::string word); void setArgs(std::shared_ptr args); diff --git a/fasttext/interface.pxd b/fasttext/interface.pxd index 1eddc64..283e29b 100644 --- a/fasttext/interface.pxd +++ b/fasttext/interface.pxd @@ -42,6 +42,7 @@ cdef extern from "interface.h": vector[real] getVectorWrapper(string word) vector[double] classifierTest(string filename, int32_t k) vector[string] classifierPredict(string text, int32_t k) + vector[vector[string]] classifierPredictProb(string text, int32_t k) # Wrapper for Dictionary class int32_t dictGetNWords() diff --git a/fasttext/model.py b/fasttext/model.py index 6915b53..1ada444 100644 --- a/fasttext/model.py +++ b/fasttext/model.py @@ -68,6 +68,14 @@ def predict(self, texts, k=1): all_labels.append(labels) return all_labels + def predict_proba(self, texts, k=1): + results = [] + for text in texts: + result = self._model.classifier_predict_prob(text, k, + self.label_prefix) + results.append(result) + return results + # Class for test result class ClassifierTestResult(object): def __init__(self, precision, recall, nexamples): From 2d5e3da4cf194a01fd5b7cbdaffa6319f1faecf3 Mon Sep 
17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Thu, 1 Sep 2016 21:57:26 +0700 Subject: [PATCH 090/109] Add test for classifier.predict_proba --- Makefile | 12 ++++++++ test/classifier_test.py | 66 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) diff --git a/Makefile b/Makefile index 6983717..e625062 100644 --- a/Makefile +++ b/Makefile @@ -99,6 +99,16 @@ test/classifier_pred_k_result.txt: test/classifier.bin test/classifier_pred_test.txt 5 > \ test/classifier_pred_k_result.txt +test/classifier_pred_prob_result.txt: test/classifier.bin + ./fasttext/cpp/fasttext predict-prob test/classifier.bin \ + test/classifier_pred_test.txt > \ + test/classifier_pred_prob_result.txt + +test/classifier_pred_prob_k_result.txt: test/classifier.bin + ./fasttext/cpp/fasttext predict-prob test/classifier.bin \ + test/classifier_pred_test.txt 5 > \ + test/classifier_pred_prob_k_result.txt + # Generate default value of classifier command from fasttext(1) test/classifier_default_params_result.txt: $(MAKE) classifier_default_params_result.txt --directory test/ @@ -107,6 +117,8 @@ test-classifier: pre-test fasttext/cpp/fasttext test/classifier.bin \ test/classifier_test_result.txt \ test/classifier_pred_result.txt \ test/classifier_pred_k_result.txt \ + test/classifier_pred_prob_result.txt \ + test/classifier_pred_prob_k_result.txt \ test/classifier_default_params_result.txt python test/classifier_test.py --verbose diff --git a/test/classifier_test.py b/test/classifier_test.py index 44a26c5..d520c9c 100644 --- a/test/classifier_test.py +++ b/test/classifier_test.py @@ -19,6 +19,8 @@ test_result = path.join(test_dir, 'classifier_test_result.txt') pred_result = path.join(test_dir, 'classifier_pred_result.txt') pred_k_result = path.join(test_dir, 'classifier_pred_k_result.txt') +pred_prob_result = path.join(test_dir, 'classifier_pred_prob_result.txt') +pred_prob_k_result = path.join(test_dir, 'classifier_pred_prob_k_result.txt') test_file = path.join(test_dir, 
'classifier_test.txt') params_txt = path.join(test_dir, 'classifier_default_params_result.txt') @@ -65,6 +67,31 @@ def read_labels_from_result(filename, label_prefix): all_labels.append(labels) return all_labels +def read_labels_from_result_prob(filename, label_prefix): + all_labels = [] + with open(filename, 'r') as f: + for line in f: + try: + line = line.decode('utf-8') + except: + line = line + + labels = [] + probabilities = [] + raw = line.split(' ') + prefix_len = len(label_prefix) + for w in raw: + w = w.strip() + if len(w) < prefix_len: + probabilities.append(float(w)) + elif w[:prefix_len] == label_prefix: + label = w.replace(label_prefix, '') + labels.append(label) + else: + probabilities.append(float(w)) + all_labels.append(zip(labels, probabilities)) + return all_labels + # To read text data to predict def read_texts(pred_file): texts = [] @@ -220,5 +247,44 @@ def test_classifier_predict_k_best(self): # fasttext(1) self.assertTrue(labels == fasttext_labels) + def test_classifier_predict_prob(self): + # Load the pre-trained classifier + label_prefix = '__label__' + classifier = ft.load_model(classifier_bin, label_prefix=label_prefix) + + # Read prediction result from fasttext(1) + fasttext_labels = read_labels_from_result_prob(pred_prob_result, + label_prefix=label_prefix) + + # Read texts from the pred_file + texts = read_texts(pred_file) + + # Predict the labels + labels = classifier.predict_proba(texts) + + # Make sure the returned labels are the same as predicted by + # fasttext(1) + self.assertTrue(labels == fasttext_labels) + + def test_classifier_predict_prob_k_best(self): + label_prefix = '__label__' + # Load the pre-trained classifier + classifier = ft.load_model(classifier_bin, label_prefix=label_prefix) + + # Read prediction result from fasttext(1) + fasttext_labels = read_labels_from_result_prob(pred_prob_k_result, + label_prefix=label_prefix) + + # Read texts from the pred_file + texts = read_texts(pred_file) + + # Predict the k-best labels 
+ labels = classifier.predict_proba(texts, k=5) + + # Make sure the returned labels are the same as predicted by + # fasttext(1) + self.assertTrue(labels == fasttext_labels) + if __name__ == '__main__': unittest.main() + From a58dda910eda2efbe3287ebaadc708cf20412bf9 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Thu, 1 Sep 2016 22:02:48 +0700 Subject: [PATCH 091/109] Add classifier.predict_proba documentation --- README.md | 52 ++++++++++++++++++++++++++++++++-------------------- README.rst | 50 +++++++++++++++++++++++++++++++------------------- 2 files changed, 63 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index 82a0888..8c64c10 100644 --- a/README.md +++ b/README.md @@ -152,6 +152,10 @@ use `classifer.predict` method: texts = ['example very long text 1', 'example very longtext 2'] labels = classifier.predict(texts) print labels + +# Or with the probability +labels = classifier.predict_proba(texts) +print labels ``` We can specify `k` value to get the k-best labels from classifier: @@ -159,6 +163,10 @@ We can specify `k` value to get the k-best labels from classifier: ```python labels = classifier.predict(texts, k=3) print labels + +# Or with the probability +labels = classifier.predict_proba(texts, k=3) +print labels ``` This interface is equivalent as `fasttext(1)` predict command. The same model @@ -349,6 +357,8 @@ This interface is equivalent as `fasttext(1)` predict command. ```python labels = classifier.predict(texts, k) +# Or with probability +labels = classifier.predict_proba(texts, k) ``` The param `k` is optional, and equal to `1` by default. @@ -358,26 +368,28 @@ The param `k` is optional, and equal to `1` by default. 
Classifier have the following atributes & methods ```python -classifier.labels # List of labels -classifier.label_prefix # Prefix of the label -classifier.dim # Size of word vector -classifier.ws # Size of context window -classifier.epoch # Number of epochs -classifier.min_count # Minimal number of word occurences -classifier.neg # Number of negative sampled -classifier.word_ngrams # Max length of word ngram -classifier.loss_name # Loss function name -classifier.bucket # Number of buckets -classifier.minn # Min length of char ngram -classifier.maxn # Max length of char ngram -classifier.lr_update_rate # Rate of updates for the learning rate -classifier.t # Value of sampling threshold -classifier.test(filename, k) # Test the classifier -classifier.predict(texts, k) # Predict the most likely label - -``` - -The param `k` for `classifier.test` and `classifier.predict` is optional, +classifier.labels # List of labels +classifier.label_prefix # Prefix of the label +classifier.dim # Size of word vector +classifier.ws # Size of context window +classifier.epoch # Number of epochs +classifier.min_count # Minimal number of word occurences +classifier.neg # Number of negative sampled +classifier.word_ngrams # Max length of word ngram +classifier.loss_name # Loss function name +classifier.bucket # Number of buckets +classifier.minn # Min length of char ngram +classifier.maxn # Max length of char ngram +classifier.lr_update_rate # Rate of updates for the learning rate +classifier.t # Value of sampling threshold +classifier.test(filename, k) # Test the classifier +classifier.predict(texts, k) # Predict the most likely label +classifier.predict_proba(texts, k) # Predict the most likely label include their probability + +``` + +The param `k` for `classifier.test`, `classifier.predict` and +`classifier.predict_proba` is optional, and equal to `1` by default. 
## References diff --git a/README.rst b/README.rst index b92755d..9807e4b 100644 --- a/README.rst +++ b/README.rst @@ -164,6 +164,10 @@ In order to obtain the most likely label for a list of text, we can use labels = classifier.predict(texts) print labels + # Or with the probability + labels = classifier.predict_proba(texts) + print labels + We can specify ``k`` value to get the k-best labels from classifier: .. code:: python @@ -171,6 +175,10 @@ We can specify ``k`` value to get the k-best labels from classifier: labels = classifier.predict(texts, k=3) print labels + # Or with the probability + labels = classifier.predict_proba(texts, k=3) + print labels + This interface is equivalent as ``fasttext(1)`` predict command. The same model with the same input set will have the same prediction. @@ -366,6 +374,9 @@ This interface is equivalent as ``fasttext(1)`` predict command. labels = classifier.predict(texts, k) + # Or with probability + labels = classifier.predict_proba(texts, k) + The param ``k`` is optional, and equal to ``1`` by default. Attributes and methods for the classifier @@ -375,25 +386,26 @@ Classifier have the following atributes & methods .. 
code:: python - classifier.labels # List of labels - classifier.label_prefix # Prefix of the label - classifier.dim # Size of word vector - classifier.ws # Size of context window - classifier.epoch # Number of epochs - classifier.min_count # Minimal number of word occurences - classifier.neg # Number of negative sampled - classifier.word_ngrams # Max length of word ngram - classifier.loss_name # Loss function name - classifier.bucket # Number of buckets - classifier.minn # Min length of char ngram - classifier.maxn # Max length of char ngram - classifier.lr_update_rate # Rate of updates for the learning rate - classifier.t # Value of sampling threshold - classifier.test(filename, k) # Test the classifier - classifier.predict(texts, k) # Predict the most likely label - -The param ``k`` for ``classifier.test`` and ``classifier.predict`` is -optional, and equal to ``1`` by default. + classifier.labels # List of labels + classifier.label_prefix # Prefix of the label + classifier.dim # Size of word vector + classifier.ws # Size of context window + classifier.epoch # Number of epochs + classifier.min_count # Minimal number of word occurences + classifier.neg # Number of negative sampled + classifier.word_ngrams # Max length of word ngram + classifier.loss_name # Loss function name + classifier.bucket # Number of buckets + classifier.minn # Min length of char ngram + classifier.maxn # Max length of char ngram + classifier.lr_update_rate # Rate of updates for the learning rate + classifier.t # Value of sampling threshold + classifier.test(filename, k) # Test the classifier + classifier.predict(texts, k) # Predict the most likely label + classifier.predict_proba(texts, k) # Predict the most likely label include their probability + +The param ``k`` for ``classifier.test``, ``classifier.predict`` and +``classifier.predict_proba`` is optional, and equal to ``1`` by default. 
References ---------- From 22344dfe40eb93d4fa349f7c2c26c518bfbd064d Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Thu, 1 Sep 2016 22:18:35 +0700 Subject: [PATCH 092/109] Improve the classifier.predict performance --- fasttext/fasttext.pyx | 3 ++- fasttext/model.py | 7 ++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/fasttext/fasttext.pyx b/fasttext/fasttext.pyx index b0322a3..b61c964 100644 --- a/fasttext/fasttext.pyx +++ b/fasttext/fasttext.pyx @@ -54,13 +54,14 @@ cdef class FastTextModelWrapper: nexamples = int(result[2]) return CTRes(precision, recall, nexamples) - def classifier_predict(self, text, k): + def classifier_predict(self, text, k, label_prefix): cdef vector[string] raw_labels text_bytes = bytes(text, 'utf-8') labels = [] raw_labels = self.fm.classifierPredict(text_bytes, k) for raw_label in raw_labels: label = raw_label.decode('utf-8') + label = label.replace(label_prefix, '') labels.append(label) return labels diff --git a/fasttext/model.py b/fasttext/model.py index 1ada444..dfff155 100644 --- a/fasttext/model.py +++ b/fasttext/model.py @@ -60,11 +60,8 @@ def test(self, test_file, k=1): def predict(self, texts, k=1): all_labels = [] for text in texts: - labels = [] - raw_labels = self._model.classifier_predict(text, k=k) - for raw_label in raw_labels: - label = raw_label.replace(self.label_prefix, '') - labels.append(label) + labels = self._model.classifier_predict(text, k, + self.label_prefix) all_labels.append(labels) return all_labels From 2876c98138df60cb8e2c1379a2a708d039d4994c Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Thu, 1 Sep 2016 22:29:53 +0700 Subject: [PATCH 093/109] Explicitly convert zip() to list() to support Python 3 --- fasttext/fasttext.pyx | 2 +- test/classifier_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fasttext/fasttext.pyx b/fasttext/fasttext.pyx index b61c964..4fe1e4a 100644 --- a/fasttext/fasttext.pyx +++ b/fasttext/fasttext.pyx @@ -80,7 +80,7 @@ 
cdef class FastTextModelWrapper: prob = float(cpp_str) labels.append(label) probabilities.append(prob) - return zip(labels, probabilities) + return list(zip(labels, probabilities)) @property def dim(self): diff --git a/test/classifier_test.py b/test/classifier_test.py index d520c9c..a399eda 100644 --- a/test/classifier_test.py +++ b/test/classifier_test.py @@ -89,7 +89,7 @@ def read_labels_from_result_prob(filename, label_prefix): labels.append(label) else: probabilities.append(float(w)) - all_labels.append(zip(labels, probabilities)) + all_labels.append(list(zip(labels, probabilities))) return all_labels # To read text data to predict From 6a2c73f18574c592f9f9e9f4bb3562d7325653bf Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Thu, 1 Sep 2016 22:33:59 +0700 Subject: [PATCH 094/109] Update v0.7.2 to v0.7.3 --- fasttext/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fasttext/VERSION b/fasttext/VERSION index 7486fdb..f38fc53 100644 --- a/fasttext/VERSION +++ b/fasttext/VERSION @@ -1 +1 @@ -0.7.2 +0.7.3 From 339e33acd09d30cecb23eba334a68b7c3ed880d4 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Thu, 1 Sep 2016 23:57:23 +0700 Subject: [PATCH 095/109] Remove fasttext/VERSION; it cause unexpected behaviour in different env --- fasttext/VERSION | 1 - fasttext/__init__.py | 9 +-------- setup.py | 9 +-------- 3 files changed, 2 insertions(+), 17 deletions(-) delete mode 100644 fasttext/VERSION diff --git a/fasttext/VERSION b/fasttext/VERSION deleted file mode 100644 index f38fc53..0000000 --- a/fasttext/VERSION +++ /dev/null @@ -1 +0,0 @@ -0.7.3 diff --git a/fasttext/__init__.py b/fasttext/__init__.py index 9b965b3..1e4d011 100644 --- a/fasttext/__init__.py +++ b/fasttext/__init__.py @@ -5,11 +5,4 @@ import os -dir_path = os.path.dirname(os.path.realpath(__file__)) -version_path = os.path.join(dir_path, 'VERSION') - -def _read_version(): - with open(version_path) as f: - return f.read().strip() - -__VERSION__ = _read_version() 
+__VERSION__ = '0.7.5' diff --git a/setup.py b/setup.py index 557ce6c..cd96350 100644 --- a/setup.py +++ b/setup.py @@ -4,13 +4,7 @@ from sys import platform import unittest -# Read the fastText.py version -def read_version(): - with open('fasttext/VERSION') as f: - return f.read().strip() - # Define the C++ extension - if platform == "darwin": extra_compile_args = ['-O3', '-pthread', '-funroll-loops', '-std=c++0x', '-stdlib=libc++', '-mmacosx-version-min=10.7'] else: @@ -36,7 +30,7 @@ def read_version(): # Package details setup( name='fasttext', - version=read_version(), + version='0.7.5', author='Bayu Aldi Yansyah', author_email='bayualdiyansyah@gmail.com', url='https://github.com/pyk/fastText.py', @@ -44,7 +38,6 @@ def read_version(): long_description=open('README.rst', 'r').read(), license='BSD 3-Clause License', packages=['fasttext'], - data_files=[('fasttext', ['fasttext/VERSION'])], ext_modules = cythonize(extensions), install_requires=[ 'numpy>=1', From 77b6d16996ee5bf586ba5d5c44e81c0712057a14 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Sat, 3 Sep 2016 00:09:36 +0700 Subject: [PATCH 096/109] Update fastText fabb04e to 602355a --- fasttext/cpp/LAST_COMMIT | 2 +- fasttext/cpp/src/dictionary.cc | 4 ++++ fasttext/cpp/src/fasttext.cc | 1 - fasttext/cpp/src/fasttext.h | 2 ++ update-fasttext.sh | 2 +- 5 files changed, 8 insertions(+), 3 deletions(-) diff --git a/fasttext/cpp/LAST_COMMIT b/fasttext/cpp/LAST_COMMIT index aec85aa..57cb2b3 100644 --- a/fasttext/cpp/LAST_COMMIT +++ b/fasttext/cpp/LAST_COMMIT @@ -1 +1 @@ -fabb04e +602355a diff --git a/fasttext/cpp/src/dictionary.cc b/fasttext/cpp/src/dictionary.cc index 4231375..7759b7f 100644 --- a/fasttext/cpp/src/dictionary.cc +++ b/fasttext/cpp/src/dictionary.cc @@ -185,6 +185,10 @@ void Dictionary::readFromFile(std::istream& in) { initNgrams(); std::cout << "Number of words: " << nwords_ << std::endl; std::cout << "Number of labels: " << nlabels_ << std::endl; + if (size_ == 0) { + std::cerr << "Empty 
vocabulary. Try a smaller -minCount value." << std::endl; + exit(EXIT_FAILURE); + } } void Dictionary::threshold(int64_t t) { diff --git a/fasttext/cpp/src/fasttext.cc b/fasttext/cpp/src/fasttext.cc index dea7d79..da7fd60 100644 --- a/fasttext/cpp/src/fasttext.cc +++ b/fasttext/cpp/src/fasttext.cc @@ -10,7 +10,6 @@ #include "fasttext.h" #include -#include #include #include diff --git a/fasttext/cpp/src/fasttext.h b/fasttext/cpp/src/fasttext.h index 777a105..91c0042 100644 --- a/fasttext/cpp/src/fasttext.h +++ b/fasttext/cpp/src/fasttext.h @@ -10,6 +10,8 @@ #ifndef FASTTEXT_FASTTEXT_H #define FASTTEXT_FASTTEXT_H +#include + #include #include diff --git a/update-fasttext.sh b/update-fasttext.sh index 7f600c5..ddbed7d 100644 --- a/update-fasttext.sh +++ b/update-fasttext.sh @@ -1,4 +1,4 @@ -NEW_VERSION=fabb04e +NEW_VERSION=602355a CURRENT_VERSION=$(cat fasttext/cpp/LAST_COMMIT) if [ "$NEW_VERSION" = "$CURRENT_VERSION" ]; then From 3ceca304602503e05c65929f8e43bd743b6249f9 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Tue, 6 Sep 2016 11:11:30 +0700 Subject: [PATCH 097/109] Add predict and predict_proba in train classifier test --- test/classifier_test.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/classifier_test.py b/test/classifier_test.py index a399eda..3f56a77 100644 --- a/test/classifier_test.py +++ b/test/classifier_test.py @@ -187,6 +187,12 @@ def test_train_classifier(self): # Make sure .bin and .vec are generated self.assertTrue(path.isfile(output + '.bin')) + # Test some methods, make sure it works + labels = model.predict(['some long long texts']) + self.assertTrue(type(labels) == type([])) + labels = model.predict_proba(['some long long texts']) + self.assertTrue(type(labels) == type([])) + def test_classifier_test(self): # Read the test result from fasttext(1) using the same classifier model precision_at_one = 0.0 From 44bc3666a89f080dffe27dfe0b2080194a82dbee Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Tue, 6 Sep 2016 
11:12:06 +0700 Subject: [PATCH 098/109] Unfree the log table to prevent segfaults --- fasttext/fasttext.pyx | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fasttext/fasttext.pyx b/fasttext/fasttext.pyx index 4fe1e4a..30ff30d 100644 --- a/fasttext/fasttext.pyx +++ b/fasttext/fasttext.pyx @@ -194,6 +194,7 @@ def train_wrapper(model_name, input_file, output, label_prefix, lr, dim, ws, raise IOError('fastText: output is not writeable!') # Initialize log & sigmoid tables + # The table is not freed since it used by utils::log globally utils.initTables() # Setup argv, arguments and their values @@ -227,9 +228,6 @@ def train_wrapper(model_name, input_file, output, label_prefix, lr, dim, ws, output_bin = output + '.bin' model = load_model(output_bin, label_prefix) - # Free the log & sigmoid tables from the heap - utils.freeTables() - # Free the allocated memory # The content from PyString_AsString is not deallocated free(c_argv) From 151911766aef1d8b93f47a43f2010a39474cfbc9 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Tue, 6 Sep 2016 11:47:48 +0700 Subject: [PATCH 099/109] Update to v0.7.6 --- fasttext/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fasttext/__init__.py b/fasttext/__init__.py index 1e4d011..3d955a8 100644 --- a/fasttext/__init__.py +++ b/fasttext/__init__.py @@ -5,4 +5,4 @@ import os -__VERSION__ = '0.7.5' +__VERSION__ = '0.7.6' diff --git a/setup.py b/setup.py index cd96350..700ae67 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ # Package details setup( name='fasttext', - version='0.7.5', + version='0.7.6', author='Bayu Aldi Yansyah', author_email='bayualdiyansyah@gmail.com', url='https://github.com/pyk/fastText.py', From fc1de74812f4cd726ee21d3775c5bd71c402eb9d Mon Sep 17 00:00:00 2001 From: Jayant Jain Date: Tue, 6 Sep 2016 15:08:40 +0530 Subject: [PATCH 100/109] improves check word in vocab performance --- fasttext/model.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/fasttext/model.py b/fasttext/model.py index dfff155..9911fd0 100644 --- a/fasttext/model.py +++ b/fasttext/model.py @@ -6,7 +6,7 @@ class WordVectorModel(object): def __init__(self, model, words): self._model = model - self.words = words + self.words = set(words) self.dim = model.dim self.ws = model.ws self.epoch = model.epoch From a1d8201438dc80dc70a093097e22a57458ac8cd4 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Tue, 13 Sep 2016 10:40:37 +0700 Subject: [PATCH 101/109] fastText: update 602355a to d652288 --- fasttext/cpp/LAST_COMMIT | 2 +- fasttext/cpp/README.md | 9 ++++++ fasttext/cpp/src/args.cc | 4 +++ fasttext/cpp/src/args.h | 1 + fasttext/cpp/src/dictionary.cc | 2 +- fasttext/cpp/src/fasttext.cc | 59 ++++++++++++++++++++++++---------- fasttext/cpp/src/fasttext.h | 4 ++- update-fasttext.sh | 2 +- 8 files changed, 62 insertions(+), 21 deletions(-) diff --git a/fasttext/cpp/LAST_COMMIT b/fasttext/cpp/LAST_COMMIT index 57cb2b3..768d485 100644 --- a/fasttext/cpp/LAST_COMMIT +++ b/fasttext/cpp/LAST_COMMIT @@ -1 +1 @@ -602355a +d652288 diff --git a/fasttext/cpp/README.md b/fasttext/cpp/README.md index 1aefd9a..6027416 100644 --- a/fasttext/cpp/README.md +++ b/fasttext/cpp/README.md @@ -109,6 +109,15 @@ The argument `k` is optional, and equal to `1` by default. See `classification-example.sh` for an example use case. In order to reproduce results from the paper [2](#bag-of-tricks-for-efficient-text-classification), run `classification-results.sh`, this will download all the datasets and reproduce the results from Table 1. +If you want to compute vector representations of sentences or paragraphs, please use: + +``` +$ ./fasttext print-vectors model.bin < text.txt +``` + +This assumes that the `text.txt` file contains the paragraphs that you want to get vectors for. +The program will output one vector representation per line in the file. 
+ ## Full documentation Invoke a command without arguments to list available arguments and their default values: diff --git a/fasttext/cpp/src/args.cc b/fasttext/cpp/src/args.cc index 55287d0..9158e26 100644 --- a/fasttext/cpp/src/args.cc +++ b/fasttext/cpp/src/args.cc @@ -31,6 +31,7 @@ Args::Args() { lrUpdateRate = 100; t = 1e-4; label = "__label__"; + verbose = 2; } void Args::parseArgs(int argc, char** argv) { @@ -102,6 +103,8 @@ void Args::parseArgs(int argc, char** argv) { t = atof(argv[ai + 1]); } else if (strcmp(argv[ai], "-label") == 0) { label = std::string(argv[ai + 1]); + } else if (strcmp(argv[ai], "-verbose") == 0) { + verbose = atoi(argv[ai + 1]); } else { std::cout << "Unknown argument: " << argv[ai] << std::endl; printHelp(); @@ -141,6 +144,7 @@ void Args::printHelp() { << " -thread number of threads [" << thread << "]\n" << " -t sampling threshold [" << t << "]\n" << " -label labels prefix [" << label << "]\n" + << " -verbose verbosity level [" << verbose << "]\n" << std::endl; } diff --git a/fasttext/cpp/src/args.h b/fasttext/cpp/src/args.h index 0efe6b7..a9a77cc 100644 --- a/fasttext/cpp/src/args.h +++ b/fasttext/cpp/src/args.h @@ -39,6 +39,7 @@ class Args { int thread; double t; std::string label; + int verbose; void parseArgs(int, char**); void printHelp(); diff --git a/fasttext/cpp/src/dictionary.cc b/fasttext/cpp/src/dictionary.cc index 7759b7f..0934f24 100644 --- a/fasttext/cpp/src/dictionary.cc +++ b/fasttext/cpp/src/dictionary.cc @@ -172,7 +172,7 @@ void Dictionary::readFromFile(std::istream& in) { int64_t minThreshold = 1; while (readWord(in, word)) { add(word); - if (ntokens_ % 1000000 == 0) { + if (ntokens_ % 1000000 == 0 && args_->verbose > 1) { std::cout << "\rRead " << ntokens_ / 1000000 << "M words" << std::flush; } if (size_ > 0.75 * MAX_VOCAB_SIZE) { diff --git a/fasttext/cpp/src/fasttext.cc b/fasttext/cpp/src/fasttext.cc index da7fd60..a830728 100644 --- a/fasttext/cpp/src/fasttext.cc +++ b/fasttext/cpp/src/fasttext.cc @@ -46,15 
+46,6 @@ void FastText::saveVectors() { ofs.close(); } -void FastText::printVectors() { - std::string word; - Vector vec(args_->dim); - while (std::cin >> word) { - getVector(vec, word); - std::cout << word << " " << vec << std::endl; - } -} - void FastText::saveModel() { std::ofstream ofs(args_->output + ".bin"); if (!ofs.is_open()) { @@ -208,6 +199,40 @@ void FastText::predict(const std::string& filename, int32_t k, bool print_prob) ifs.close(); } +void FastText::wordVectors() { + std::string word; + Vector vec(args_->dim); + while (std::cin >> word) { + getVector(vec, word); + std::cout << word << " " << vec << std::endl; + } +} + +void FastText::textVectors() { + std::vector line, labels; + Vector vec(args_->dim); + while (std::cin.peek() != EOF) { + dict_->getLine(std::cin, line, labels, model_->rng); + dict_->addNgrams(line, args_->wordNgrams); + vec.zero(); + for (auto it = line.cbegin(); it != line.cend(); ++it) { + vec.addRow(*input_, *it); + } + if (!line.empty()) { + vec.mul(1.0 / line.size()); + } + std::cout << vec << std::endl; + } +} + +void FastText::printVectors() { + if (args_->model == model_name::sup) { + textVectors(); + } else { + wordVectors(); + } +} + void FastText::trainThread(int32_t threadId) { std::ifstream ifs(args_->input); utils::seek(ifs, threadId * utils::size(ifs) / args_->thread); @@ -237,7 +262,7 @@ void FastText::trainThread(int32_t threadId) { if (localTokenCount > args_->lrUpdateRate) { tokenCount += localTokenCount; localTokenCount = 0; - if (threadId == 0) { + if (threadId == 0 && args_->verbose > 1) { printInfo(progress, model.getLoss()); } } @@ -290,13 +315,13 @@ void printUsage() { std::cout << "usage: fasttext \n\n" << "The commands supported by fasttext are:\n\n" - << " supervised train a supervised classifier\n" - << " test evaluate a supervised classifier\n" - << " predict predict most likely labels\n" - << " predict-prob predict most likely labels with probabilities\n" - << " skipgram train a skipgram model\n" - << 
" cbow train a cbow model\n" - << " print-vectors print vectors given a trained model\n" + << " supervised train a supervised classifier\n" + << " test evaluate a supervised classifier\n" + << " predict predict most likely labels\n" + << " predict-prob predict most likely labels with probabilities\n" + << " skipgram train a skipgram model\n" + << " cbow train a cbow model\n" + << " print-vectors print vectors given a trained model\n" << std::endl; } diff --git a/fasttext/cpp/src/fasttext.h b/fasttext/cpp/src/fasttext.h index 91c0042..bdbaed5 100644 --- a/fasttext/cpp/src/fasttext.h +++ b/fasttext/cpp/src/fasttext.h @@ -36,7 +36,6 @@ class FastText { public: void getVector(Vector&, const std::string&); void saveVectors(); - void printVectors(); void saveModel(); void loadModel(const std::string&); void printInfo(real, real); @@ -47,6 +46,9 @@ class FastText { void skipgram(Model&, real, const std::vector&); void test(const std::string&, int32_t); void predict(const std::string&, int32_t, bool); + void wordVectors(); + void textVectors(); + void printVectors(); void trainThread(int32_t); void train(std::shared_ptr); }; diff --git a/update-fasttext.sh b/update-fasttext.sh index ddbed7d..2cd3522 100644 --- a/update-fasttext.sh +++ b/update-fasttext.sh @@ -1,4 +1,4 @@ -NEW_VERSION=602355a +NEW_VERSION=d652288 CURRENT_VERSION=$(cat fasttext/cpp/LAST_COMMIT) if [ "$NEW_VERSION" = "$CURRENT_VERSION" ]; then From 17b95239bb57eb1ace25a1484a64696fad2466c9 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Sat, 24 Sep 2016 23:23:10 +0700 Subject: [PATCH 102/109] fastText: d652288 updated to 16f623f --- fasttext/cpp/LAST_COMMIT | 2 +- fasttext/cpp/README.md | 2 +- fasttext/cpp/eval.py | 9 +++------ fasttext/cpp/src/dictionary.cc | 11 +++++++---- fasttext/cpp/src/fasttext.cc | 4 ++-- fasttext/cpp/word-vector-example.sh | 2 +- update-fasttext.sh | 2 +- 7 files changed, 16 insertions(+), 16 deletions(-) diff --git a/fasttext/cpp/LAST_COMMIT b/fasttext/cpp/LAST_COMMIT index 
768d485..a97b59c 100644 --- a/fasttext/cpp/LAST_COMMIT +++ b/fasttext/cpp/LAST_COMMIT @@ -1 +1 @@ -d652288 +16f623f diff --git a/fasttext/cpp/README.md b/fasttext/cpp/README.md index 6027416..83fb24f 100644 --- a/fasttext/cpp/README.md +++ b/fasttext/cpp/README.md @@ -31,7 +31,7 @@ If you do not plan on using the default system-wide compiler, update the two mac ### Building with Docker -If you inted to build with Docker, a Docker file is available here [fastText-Docker](https://github.com/xeb/fastText-docker). +If you intend to build with Docker, a Docker file is available here [fastText-Docker](https://github.com/xeb/fastText-docker). ## Example use cases diff --git a/fasttext/cpp/eval.py b/fasttext/cpp/eval.py index 3a4596e..f946334 100644 --- a/fasttext/cpp/eval.py +++ b/fasttext/cpp/eval.py @@ -21,10 +21,7 @@ import argparse def compat_splitting(line): - if sys.version > "3": - return line.split() - else: # if version is 2 - return line.decode('utf8').split() + return line.decode('utf8').split() def similarity(v1, v2): n1 = np.linalg.norm(v1) @@ -37,7 +34,7 @@ def similarity(v1, v2): args = parser.parse_args() vectors = {} -fin = open(args.modelPath, 'r') +fin = open(args.modelPath, 'rb') for i, line in enumerate(fin): try: tab = compat_splitting(line) @@ -56,7 +53,7 @@ def similarity(v1, v2): drop = 0.0 nwords = 0.0 -fin = open(args.dataPath, 'r') +fin = open(args.dataPath, 'rb') for line in fin: tline = compat_splitting(line) word1 = tline[0].lower() diff --git a/fasttext/cpp/src/dictionary.cc b/fasttext/cpp/src/dictionary.cc index 0934f24..32cbd7e 100644 --- a/fasttext/cpp/src/dictionary.cc +++ b/fasttext/cpp/src/dictionary.cc @@ -147,10 +147,10 @@ void Dictionary::initNgrams() { bool Dictionary::readWord(std::istream& in, std::string& word) { char c; + std::streambuf& sb = *in.rdbuf(); word.clear(); - while (in.peek() != EOF) { - in.get(c); - if (isspace(c) || c == 0) { + while ((c = sb.sbumpc()) != EOF) { + if (c == ' ' || c == '\n' || c == '\r' || c == 
'\t' || c == '\v' || c == '\f' || c == '\0') { if (word.empty()) { if (c == '\n') { word += EOS; @@ -158,12 +158,15 @@ bool Dictionary::readWord(std::istream& in, std::string& word) } continue; } else { - if (c == '\n') in.unget(); + if (c == '\n') + sb.sungetc(); return true; } } word.push_back(c); } + // trigger eofbit + in.get(); return !word.empty(); } diff --git a/fasttext/cpp/src/fasttext.cc b/fasttext/cpp/src/fasttext.cc index a830728..93b7add 100644 --- a/fasttext/cpp/src/fasttext.cc +++ b/fasttext/cpp/src/fasttext.cc @@ -47,7 +47,7 @@ void FastText::saveVectors() { } void FastText::saveModel() { - std::ofstream ofs(args_->output + ".bin"); + std::ofstream ofs(args_->output + ".bin", std::ofstream::binary); if (!ofs.is_open()) { std::cerr << "Model file cannot be opened for saving!" << std::endl; exit(EXIT_FAILURE); @@ -60,7 +60,7 @@ void FastText::saveModel() { } void FastText::loadModel(const std::string& filename) { - std::ifstream ifs(filename); + std::ifstream ifs(filename, std::ifstream::binary); if (!ifs.is_open()) { std::cerr << "Model file cannot be opened for loading!" << std::endl; exit(EXIT_FAILURE); diff --git a/fasttext/cpp/word-vector-example.sh b/fasttext/cpp/word-vector-example.sh index 2c5296d..397c8ac 100755 --- a/fasttext/cpp/word-vector-example.sh +++ b/fasttext/cpp/word-vector-example.sh @@ -23,7 +23,7 @@ fi if [ ! 
-f "${DATADIR}/rw/rw.txt" ] then - wget -c http://www-nlp.stanford.edu/~lmthang/morphoNLM/rw.zip -P "${DATADIR}" + wget -c http://stanford.edu/~lmthang/morphoNLM/rw.zip -P "${DATADIR}" unzip "${DATADIR}/rw.zip" -d "${DATADIR}" fi diff --git a/update-fasttext.sh b/update-fasttext.sh index 2cd3522..cfc307b 100644 --- a/update-fasttext.sh +++ b/update-fasttext.sh @@ -1,4 +1,4 @@ -NEW_VERSION=d652288 +NEW_VERSION=16f623f CURRENT_VERSION=$(cat fasttext/cpp/LAST_COMMIT) if [ "$NEW_VERSION" = "$CURRENT_VERSION" ]; then From df22043b2a0795dafb986f555f42f89eb57a2ce4 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Sun, 25 Sep 2016 23:16:15 +0700 Subject: [PATCH 103/109] Update the dbpedia dataset's url --- test/download_dbpedia.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/download_dbpedia.sh b/test/download_dbpedia.sh index adb7fa1..7a9e296 100644 --- a/test/download_dbpedia.sh +++ b/test/download_dbpedia.sh @@ -17,7 +17,7 @@ normalize_text() { } echo "Downloading the dbpedia_csv.tar.gz ..." 
-wget -c "https://googledrive.com/host/0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k" \ +wget -c "https://github.com/pyk/dbpedia_csv/raw/master/dbpedia_csv.tar.gz" \ -O test/dbpedia_csv.tar.gz echo "Extract dbpedia_csv.tar.gz to test/" From 632dd4542824b96cfb6ec2aaef3f65d35021ea43 Mon Sep 17 00:00:00 2001 From: Travis <381404825@qq.com> Date: Wed, 28 Sep 2016 18:02:15 +0800 Subject: [PATCH 104/109] update param name 'input' to 'input_file' in doc param name for fasttext.cbow(), fasttext.skipgram() and fasttext.supervised() --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8c64c10..b67ddf9 100644 --- a/README.md +++ b/README.md @@ -185,7 +185,7 @@ model = fasttext.skipgram(params) List of available `params` and their default value: ``` -input training file path (required) +input_file training file path (required) output output file path (required) lr learning rate [0.05] lr_update_rate change the rate of updates for the learning rate [100] @@ -222,7 +222,7 @@ model = fasttext.cbow(params) List of available `params` and their default value: ``` -input training file path (required) +input_file training file path (required) output output file path (required) lr learning rate [0.05] lr_update_rate change the rate of updates for the learning rate [100] @@ -290,7 +290,7 @@ classifier = fasttext.supervised(params) List of available `params` and their default value: ``` -input training file path (required) +input_file training file path (required) output output file path (required) label_prefix label prefix ['__label__'] lr learning rate [0.1] From 99131db828879accbca4ce018af7193483271b6d Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Thu, 29 Sep 2016 23:01:13 +0700 Subject: [PATCH 105/109] Add test for encoding param --- test/cbow_test.py | 2 +- test/classifier_test.py | 3 ++- test/skipgram_test.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/test/cbow_test.py b/test/cbow_test.py index 
8838829..26938b6 100644 --- a/test/cbow_test.py +++ b/test/cbow_test.py @@ -20,7 +20,7 @@ # Test to make sure that cbow interface run correctly class TestCBOWModel(unittest.TestCase): def test_load_cbow_model(self): - model = ft.load_model(cbow_file) + model = ft.load_model(cbow_file, encoding='utf-8') # Make sure the model is returned correctly self.assertEqual(model.model_name, 'cbow') diff --git a/test/classifier_test.py b/test/classifier_test.py index 3f56a77..de6abc4 100644 --- a/test/classifier_test.py +++ b/test/classifier_test.py @@ -108,7 +108,8 @@ def read_texts(pred_file): class TestClassifierModel(unittest.TestCase): def test_load_classifier_model(self): label_prefix='__label__' - model = ft.load_model(classifier_bin, label_prefix=label_prefix) + model = ft.load_model(classifier_bin, label_prefix=label_prefix, + encoding='utf-8') # Make sure the model is returned correctly self.assertEqual(model.model_name, 'supervised') diff --git a/test/skipgram_test.py b/test/skipgram_test.py index 1d53335..6a17312 100644 --- a/test/skipgram_test.py +++ b/test/skipgram_test.py @@ -20,7 +20,7 @@ # Test to make sure that skipgram interface run correctly class TestSkipgramModel(unittest.TestCase): def test_load_skipgram_model(self): - model = ft.load_model(skipgram_file) + model = ft.load_model(skipgram_file, encoding='utf-8') # Make sure the model is returned correctly self.assertEqual(model.model_name, 'skipgram') From e5020b7e025656b09e96f0d2412ff2da184cf434 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Thu, 29 Sep 2016 23:02:56 +0700 Subject: [PATCH 106/109] Support non-utf8 encoding --- fasttext/fasttext.pyx | 46 +++++++++++++++++++++++++------------------ fasttext/model.py | 10 ++++++---- 2 files changed, 33 insertions(+), 23 deletions(-) diff --git a/fasttext/fasttext.pyx b/fasttext/fasttext.pyx index 30ff30d..a3bce94 100644 --- a/fasttext/fasttext.pyx +++ b/fasttext/fasttext.pyx @@ -19,10 +19,14 @@ from builtins import bytes # This class wrap C++ class 
FastTextModel, so it can be accessed via Python cdef class FastTextModelWrapper: cdef FastTextModel fm + cdef string encoding def __cinit__(self): self.fm = FastTextModel() + def __init__(self, encoding='utf-8'): + self.encoding = encoding + # dict_* methods is a wrapper for the Dictionary class methods; # We can't access dicrectly Dictionary in python because # Dictionary class doesn't have a nullary constructor @@ -32,7 +36,7 @@ cdef class FastTextModelWrapper: def dict_get_word(self, i): cdef string cpp_string cpp_string = self.fm.dictGetWord(i) - return cpp_string.decode('utf-8') + return cpp_string.decode(self.encoding) def dict_nlabels(self): return self.fm.dictGetNLabels() @@ -40,14 +44,14 @@ cdef class FastTextModelWrapper: def dict_get_label(self, i): cdef string cpp_string cpp_string = self.fm.dictGetLabel(i) - return cpp_string.decode('utf-8') + return cpp_string.decode(self.encoding) def get_vector(self, word): - word_bytes = bytes(word, 'utf-8') + word_bytes = bytes(word, self.encoding) return self.fm.getVectorWrapper(word_bytes) def classifier_test(self, test_file, k): - test_file = bytes(test_file, 'utf-8') + test_file = bytes(test_file, self.encoding) result = self.fm.classifierTest(test_file, k) precision = float(result[0]) recall = float(result[1]) @@ -56,11 +60,11 @@ cdef class FastTextModelWrapper: def classifier_predict(self, text, k, label_prefix): cdef vector[string] raw_labels - text_bytes = bytes(text, 'utf-8') + text_bytes = bytes(text, self.encoding) labels = [] raw_labels = self.fm.classifierPredict(text_bytes, k) for raw_label in raw_labels: - label = raw_label.decode('utf-8') + label = raw_label.decode(self.encoding) label = label.replace(label_prefix, '') labels.append(label) return labels @@ -68,13 +72,13 @@ cdef class FastTextModelWrapper: def classifier_predict_prob(self, text, k, label_prefix): cdef vector[vector[string]] raw_results cdef string cpp_str - text_bytes = bytes(text, 'utf-8') + text_bytes = bytes(text, self.encoding) 
labels = [] probabilities = [] raw_results = self.fm.classifierPredictProb(text_bytes, k) for result in raw_results: cpp_str = result[0] - label = cpp_str.decode('utf-8') + label = cpp_str.decode(self.encoding) label = label.replace(label_prefix, '') cpp_str = result[1] prob = float(cpp_str) @@ -138,10 +142,14 @@ cdef class FastTextModelWrapper: def t(self): return self.fm.t + @property + def encoding(self): + return self.encoding + # Load .bin file that generated by fastText # label_prefix is an optional argument to load the supervised model # prefix will be removed from the label name and stored in the model.labels -def load_model(filename, label_prefix=''): +def load_model(filename, label_prefix='', encoding='utf-8'): # Initialize log & sigmoid tables utils.initTables() @@ -149,8 +157,8 @@ def load_model(filename, label_prefix=''): if not os.path.isfile(filename): raise ValueError('fastText: trained model cannot be opened!') - model = FastTextModelWrapper() - filename_bytes = bytes(filename, 'utf-8') + model = FastTextModelWrapper(encoding=encoding) + filename_bytes = bytes(filename, encoding) try: # How we load the dictionary loadModelWrapper(filename_bytes, model.fm) @@ -179,7 +187,7 @@ def load_model(filename, label_prefix=''): # Wrapper for train(int argc, char *argv) C++ function in cpp/src/fasttext.cc def train_wrapper(model_name, input_file, output, label_prefix, lr, dim, ws, epoch, min_count, neg, word_ngrams, loss, bucket, minn, maxn, thread, - lr_update_rate, t, silent=1): + lr_update_rate, t, silent=1, encoding='utf-8'): # Check if the input_file is valid if not os.path.isfile(input_file): @@ -226,7 +234,7 @@ def train_wrapper(model_name, input_file, output, label_prefix, lr, dim, ws, # Load the model output_bin = output + '.bin' - model = load_model(output_bin, label_prefix) + model = load_model(output_bin, label_prefix, encoding=encoding) # Free the allocated memory # The content from PyString_AsString is not deallocated @@ -237,26 +245,26 @@ def 
train_wrapper(model_name, input_file, output, label_prefix, lr, dim, ws, # Learn word representation using skipgram model def skipgram(input_file, output, lr=0.05, dim=100, ws=5, epoch=5, min_count=5, neg=5, word_ngrams=1, loss='ns', bucket=2000000, minn=3, maxn=6, - thread=12, lr_update_rate=100, t=1e-4, silent=1): + thread=12, lr_update_rate=100, t=1e-4, silent=1, encoding='utf-8'): label_prefix = '' return train_wrapper('skipgram', input_file, output, label_prefix, lr, dim, ws, epoch, min_count, neg, word_ngrams, loss, bucket, minn, - maxn, thread, lr_update_rate, t, silent) + maxn, thread, lr_update_rate, t, silent, encoding='utf-8') # Learn word representation using CBOW model def cbow(input_file, output, lr=0.05, dim=100, ws=5, epoch=5, min_count=5, neg=5, word_ngrams=1, loss='ns', bucket=2000000, minn=3, maxn=6, - thread=12, lr_update_rate=100, t=1e-4, silent=1): + thread=12, lr_update_rate=100, t=1e-4, silent=1, encoding='utf-8'): label_prefix = '' return train_wrapper('cbow', input_file, output, label_prefix, lr, dim, ws, epoch, min_count, neg, word_ngrams, loss, bucket, minn, maxn, - thread, lr_update_rate, t, silent) + thread, lr_update_rate, t, silent, encoding) # Train classifier def supervised(input_file, output, label_prefix='__label__', lr=0.1, dim=100, ws=5, epoch=5, min_count=1, neg=5, word_ngrams=1, loss='softmax', bucket=0, minn=0, maxn=0, thread=12, lr_update_rate=100, - t=1e-4, silent=1): + t=1e-4, silent=1, encoding='utf-8'): return train_wrapper('supervised', input_file, output, label_prefix, lr, dim, ws, epoch, min_count, neg, word_ngrams, loss, bucket, minn, - maxn, thread, lr_update_rate, t, silent) + maxn, thread, lr_update_rate, t, silent, encoding) diff --git a/fasttext/model.py b/fasttext/model.py index 9911fd0..bdaa6ec 100644 --- a/fasttext/model.py +++ b/fasttext/model.py @@ -13,13 +13,14 @@ def __init__(self, model, words): self.min_count = model.minCount self.neg = model.neg self.word_ngrams = model.wordNgrams - self.loss_name = 
model.lossName.decode('utf-8') - self.model_name = model.modelName.decode('utf-8') + self.loss_name = model.lossName.decode(model.encoding) + self.model_name = model.modelName.decode(model.encoding) self.bucket = model.bucket self.minn = model.minn self.maxn = model.maxn self.lr_update_rate = model.lrUpdateRate self.t = model.t + self.encoding = model.encoding def __getitem__(self, word): return self._model.get_vector(word) @@ -45,14 +46,15 @@ def __init__(self, model, labels, label_prefix): self.min_count = model.minCount self.neg = model.neg self.word_ngrams = model.wordNgrams - self.loss_name = model.lossName.decode('utf-8') - self.model_name = model.modelName.decode('utf-8') + self.loss_name = model.lossName.decode(model.encoding) + self.model_name = model.modelName.decode(model.encoding) self.bucket = model.bucket self.minn = model.minn self.maxn = model.maxn self.lr_update_rate = model.lrUpdateRate self.t = model.t self.label_prefix = label_prefix + self.encoding = model.encoding def test(self, test_file, k=1): return self._model.classifier_test(test_file, k) From 6dd96fd62b4dfaac2d3b288d92cf15c19019d270 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Thu, 29 Sep 2016 23:03:48 +0700 Subject: [PATCH 107/109] Update README.* --- README.md | 7 ++++++- README.rst | 13 +++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index b67ddf9..a78f481 100644 --- a/README.md +++ b/README.md @@ -202,6 +202,7 @@ maxn max length of char ngram [6] thread number of threads [12] t sampling threshold [0.0001] silent disable the log output from the C++ extension [1] +encoding specify input_file encoding [utf-8] ``` @@ -239,6 +240,7 @@ maxn max length of char ngram [6] thread number of threads [12] t sampling threshold [0.0001] silent disable the log output from the C++ extension [1] +encoding specify input_file encoding [utf-8] ``` @@ -254,7 +256,7 @@ File `.bin` that previously trained or generated by fastText can be loaded 
using this function ```python -model = fasttext.load_model('model.bin') +model = fasttext.load_model('model.bin', encoding='utf-8') ``` ### Attributes and methods for the model @@ -276,6 +278,7 @@ model.minn # Min length of char ngram model.maxn # Max length of char ngram model.lr_update_rate # Rate of updates for the learning rate model.t # Value of sampling threshold +model.encoding # Encoding of the model model[word] # Get the vector of specified word ``` @@ -308,6 +311,7 @@ maxn max length of char ngram [0] thread number of threads [12] t sampling threshold [0.0001] silent disable the log output from the C++ extension [1] +encoding specify input_file encoding [utf-8] ``` @@ -382,6 +386,7 @@ classifier.minn # Min length of char ngram classifier.maxn # Max length of char ngram classifier.lr_update_rate # Rate of updates for the learning rate classifier.t # Value of sampling threshold +classifier.encoding # Encoding that used by classifier classifier.test(filename, k) # Test the classifier classifier.predict(texts, k) # Predict the most likely label classifier.predict_proba(texts, k) # Predict the most likely label include their probability diff --git a/README.rst b/README.rst index 9807e4b..6a3cabf 100644 --- a/README.rst +++ b/README.rst @@ -198,7 +198,7 @@ List of available ``params`` and their default value: :: - input training file path (required) + input_file training file path (required) output output file path (required) lr learning rate [0.05] lr_update_rate change the rate of updates for the learning rate [100] @@ -215,6 +215,7 @@ List of available ``params`` and their default value: thread number of threads [12] t sampling threshold [0.0001] silent disable the log output from the C++ extension [1] + encoding specify input_file encoding [utf-8] Example usage: @@ -235,7 +236,7 @@ List of available ``params`` and their default value: :: - input training file path (required) + input_file training file path (required) output output file path (required) lr 
learning rate [0.05] lr_update_rate change the rate of updates for the learning rate [100] @@ -252,6 +253,7 @@ List of available ``params`` and their default value: thread number of threads [12] t sampling threshold [0.0001] silent disable the log output from the C++ extension [1] + encoding specify input_file encoding [utf-8] Example usage: @@ -267,7 +269,7 @@ loaded using this function .. code:: python - model = fasttext.load_model('model.bin') + model = fasttext.load_model('model.bin', encoding='utf-8') Attributes and methods for the model ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -290,6 +292,7 @@ Skipgram and CBOW model have the following atributes & methods model.maxn # Max length of char ngram model.lr_update_rate # Rate of updates for the learning rate model.t # Value of sampling threshold + model.encoding # Encoding of the model model[word] # Get the vector of specified word Supervised model @@ -305,7 +308,7 @@ List of available ``params`` and their default value: :: - input training file path (required) + input_file training file path (required) output output file path (required) label_prefix label prefix ['__label__'] lr learning rate [0.1] @@ -323,6 +326,7 @@ List of available ``params`` and their default value: thread number of threads [12] t sampling threshold [0.0001] silent disable the log output from the C++ extension [1] + encoding specify input_file encoding [utf-8] Example usage: @@ -400,6 +404,7 @@ Classifier have the following atributes & methods classifier.maxn # Max length of char ngram classifier.lr_update_rate # Rate of updates for the learning rate classifier.t # Value of sampling threshold + classifier.encoding # Encoding that used by classifier classifier.test(filename, k) # Test the classifier classifier.predict(texts, k) # Predict the most likely label classifier.predict_proba(texts, k) # Predict the most likely label include their probability From 7e789e6789f2e38885d843d01e6dd413228a9539 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah 
Date: Fri, 30 Sep 2016 21:18:38 +0700 Subject: [PATCH 108/109] Support non-utf8 for Python 3 --- fasttext/fasttext.pyx | 46 ++++++++++++++++++------------------------- fasttext/model.py | 24 +++++++++++----------- 2 files changed, 31 insertions(+), 39 deletions(-) diff --git a/fasttext/fasttext.pyx b/fasttext/fasttext.pyx index a3bce94..01e6235 100644 --- a/fasttext/fasttext.pyx +++ b/fasttext/fasttext.pyx @@ -19,66 +19,62 @@ from builtins import bytes # This class wrap C++ class FastTextModel, so it can be accessed via Python cdef class FastTextModelWrapper: cdef FastTextModel fm - cdef string encoding def __cinit__(self): self.fm = FastTextModel() - def __init__(self, encoding='utf-8'): - self.encoding = encoding - # dict_* methods is a wrapper for the Dictionary class methods; # We can't access dicrectly Dictionary in python because # Dictionary class doesn't have a nullary constructor def dict_nwords(self): return self.fm.dictGetNWords() - def dict_get_word(self, i): + def dict_get_word(self, i, encoding): cdef string cpp_string cpp_string = self.fm.dictGetWord(i) - return cpp_string.decode(self.encoding) + return cpp_string.decode(encoding) def dict_nlabels(self): return self.fm.dictGetNLabels() - def dict_get_label(self, i): + def dict_get_label(self, i, encoding): cdef string cpp_string cpp_string = self.fm.dictGetLabel(i) - return cpp_string.decode(self.encoding) + return cpp_string.decode(encoding) - def get_vector(self, word): - word_bytes = bytes(word, self.encoding) + def get_vector(self, word, encoding): + word_bytes = bytes(word, encoding) return self.fm.getVectorWrapper(word_bytes) - def classifier_test(self, test_file, k): - test_file = bytes(test_file, self.encoding) + def classifier_test(self, test_file, k, encoding): + test_file = bytes(test_file, encoding) result = self.fm.classifierTest(test_file, k) precision = float(result[0]) recall = float(result[1]) nexamples = int(result[2]) return CTRes(precision, recall, nexamples) - def 
classifier_predict(self, text, k, label_prefix): + def classifier_predict(self, text, k, label_prefix, encoding): cdef vector[string] raw_labels - text_bytes = bytes(text, self.encoding) + text_bytes = bytes(text, encoding) labels = [] raw_labels = self.fm.classifierPredict(text_bytes, k) for raw_label in raw_labels: - label = raw_label.decode(self.encoding) + label = raw_label.decode(encoding) label = label.replace(label_prefix, '') labels.append(label) return labels - def classifier_predict_prob(self, text, k, label_prefix): + def classifier_predict_prob(self, text, k, label_prefix, encoding): cdef vector[vector[string]] raw_results cdef string cpp_str - text_bytes = bytes(text, self.encoding) + text_bytes = bytes(text, encoding) labels = [] probabilities = [] raw_results = self.fm.classifierPredictProb(text_bytes, k) for result in raw_results: cpp_str = result[0] - label = cpp_str.decode(self.encoding) + label = cpp_str.decode(encoding) label = label.replace(label_prefix, '') cpp_str = result[1] prob = float(cpp_str) @@ -142,10 +138,6 @@ cdef class FastTextModelWrapper: def t(self): return self.fm.t - @property - def encoding(self): - return self.encoding - # Load .bin file that generated by fastText # label_prefix is an optional argument to load the supervised model # prefix will be removed from the label name and stored in the model.labels @@ -157,7 +149,7 @@ def load_model(filename, label_prefix='', encoding='utf-8'): if not os.path.isfile(filename): raise ValueError('fastText: trained model cannot be opened!') - model = FastTextModelWrapper(encoding=encoding) + model = FastTextModelWrapper() filename_bytes = bytes(filename, encoding) try: # How we load the dictionary @@ -171,16 +163,16 @@ def load_model(filename, label_prefix='', encoding='utf-8'): words = [] # We build the dictionary here to support unicode characters for i in xrange(model.dict_nwords()): - word = model.dict_get_word(i) + word = model.dict_get_word(i, encoding) words.append(word) - return 
WordVectorModel(model, words) + return WordVectorModel(model, words, encoding) elif model_name == 'supervised': labels = [] for i in xrange(model.dict_nlabels()): - label = model.dict_get_label(i) + label = model.dict_get_label(i, encoding) # Remove the prefix labels.append(label.replace(label_prefix, '')) - return SupervisedModel(model, labels, label_prefix) + return SupervisedModel(model, labels, label_prefix, encoding) else: raise ValueError('fastText: model name is not valid!') diff --git a/fasttext/model.py b/fasttext/model.py index bdaa6ec..954bf34 100644 --- a/fasttext/model.py +++ b/fasttext/model.py @@ -4,7 +4,7 @@ # Class for Skipgram and CBOW model class WordVectorModel(object): - def __init__(self, model, words): + def __init__(self, model, words, encoding='utf-8'): self._model = model self.words = set(words) self.dim = model.dim @@ -13,17 +13,17 @@ def __init__(self, model, words): self.min_count = model.minCount self.neg = model.neg self.word_ngrams = model.wordNgrams - self.loss_name = model.lossName.decode(model.encoding) - self.model_name = model.modelName.decode(model.encoding) + self.loss_name = model.lossName.decode(encoding) + self.model_name = model.modelName.decode(encoding) self.bucket = model.bucket self.minn = model.minn self.maxn = model.maxn self.lr_update_rate = model.lrUpdateRate self.t = model.t - self.encoding = model.encoding + self.encoding = encoding def __getitem__(self, word): - return self._model.get_vector(word) + return self._model.get_vector(word, self.encoding) def __contains__(self, word): return word in self.words @@ -37,7 +37,7 @@ def cosine_similarity(self, first_word, second_word): # Class for classifier model class SupervisedModel(object): - def __init__(self, model, labels, label_prefix): + def __init__(self, model, labels, label_prefix, encoding='utf-8'): self._model = model self.labels = labels self.dim = model.dim @@ -46,24 +46,24 @@ def __init__(self, model, labels, label_prefix): self.min_count = model.minCount 
self.neg = model.neg self.word_ngrams = model.wordNgrams - self.loss_name = model.lossName.decode(model.encoding) - self.model_name = model.modelName.decode(model.encoding) + self.loss_name = model.lossName.decode(encoding) + self.model_name = model.modelName.decode(encoding) self.bucket = model.bucket self.minn = model.minn self.maxn = model.maxn self.lr_update_rate = model.lrUpdateRate self.t = model.t self.label_prefix = label_prefix - self.encoding = model.encoding + self.encoding = encoding def test(self, test_file, k=1): - return self._model.classifier_test(test_file, k) + return self._model.classifier_test(test_file, k, self.encoding) def predict(self, texts, k=1): all_labels = [] for text in texts: labels = self._model.classifier_predict(text, k, - self.label_prefix) + self.label_prefix, self.encoding) all_labels.append(labels) return all_labels @@ -71,7 +71,7 @@ def predict_proba(self, texts, k=1): results = [] for text in texts: result = self._model.classifier_predict_prob(text, k, - self.label_prefix) + self.label_prefix, self.encoding) results.append(result) return results From cde9ec34e5e6ff5d4d51c127a642040e70f681a4 Mon Sep 17 00:00:00 2001 From: Bayu Aldi Yansyah Date: Wed, 5 Oct 2016 15:35:58 +0700 Subject: [PATCH 109/109] Release v0.8.0 --- fasttext/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fasttext/__init__.py b/fasttext/__init__.py index 3d955a8..842f8bf 100644 --- a/fasttext/__init__.py +++ b/fasttext/__init__.py @@ -5,4 +5,4 @@ import os -__VERSION__ = '0.7.6' +__VERSION__ = '0.8.0' diff --git a/setup.py b/setup.py index 700ae67..de0accf 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ # Package details setup( name='fasttext', - version='0.7.6', + version='0.8.0', author='Bayu Aldi Yansyah', author_email='bayualdiyansyah@gmail.com', url='https://github.com/pyk/fastText.py',