diff --git a/Topic_modelling_3.ipynb b/Topic_modelling_3.ipynb new file mode 100644 index 0000000..649f136 --- /dev/null +++ b/Topic_modelling_3.ipynb @@ -0,0 +1,2081 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import itertools\n", + "import pickle\n", + "from pprint import pprint\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import nltk\n", + "from nltk.stem import WordNetLemmatizer, SnowballStemmer\n", + "from nltk.stem.porter import *\n", + "from nltk.corpus import wordnet\n", + "from nltk import FreqDist\n", + "from nltk.corpus import stopwords\n", + "\n", + "import spacy\n", + "\n", + "import gensim\n", + "from gensim.utils import simple_preprocess\n", + "from gensim.parsing.preprocessing import STOPWORDS\n", + "from gensim.models.ldamodel import LdaModel\n", + "from gensim.models import CoherenceModel\n", + "import gensim.corpora as corpora\n", + "\n", + "import pyLDAvis.gensim\n", + "np.random.seed(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "data = pd.read_excel('Rubber News.xlsx')  # error_bad_lines is a read_csv option, not a read_excel one" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(5275, 6)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "data = data[[\"Full.story\"]]\n", + "data= data.apply(lambda x: x.astype(str).str.lower())" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, +
"outputs": [ + { + "data": { + "text/plain": [ + "(5275, 1)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# data.rename(columns = {'Full.story':'full_story'}, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "documents = data.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Full.story
0sumitomo rubber industries has established a n...
1spot rubber closed unchanged on thursday. rss ...
2delegate registration for india rubber meet 20...
3mumbai – futures contracts of rubber on the in...
4tapping has been delayed despite the fact that...
......
5270cogencis, wednesday, apr 11 new delhi – india’...
5271cogencis, tuesday, apr 10 by prabhnoor nanda n...
5272outlook futures contracts of natural rubber ma...
5273cogencis, tuesday, apr 10 by shikha singh new ...
5274cogencis, monday, apr 9 by prabhnoor nanda new...
\n", + "

5275 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " Full.story\n", + "0 sumitomo rubber industries has established a n...\n", + "1 spot rubber closed unchanged on thursday. rss ...\n", + "2 delegate registration for india rubber meet 20...\n", + "3 mumbai – futures contracts of rubber on the in...\n", + "4 tapping has been delayed despite the fact that...\n", + "... ...\n", + "5270 cogencis, wednesday, apr 11 new delhi – india’...\n", + "5271 cogencis, tuesday, apr 10 by prabhnoor nanda n...\n", + "5272 outlook futures contracts of natural rubber ma...\n", + "5273 cogencis, tuesday, apr 10 by shikha singh new ...\n", + "5274 cogencis, monday, apr 9 by prabhnoor nanda new...\n", + "\n", + "[5275 rows x 1 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'cogencis, monday, jul 30 –source: india jun natural rubber use 102,000 tn, up 17.7% on year –india jun natural rubber imports 38,367 tn, up 14.7% on year –india natural rubber june-end stocks 232,000 tn by mugunthan kesavan new delhi – india’s natural rubber production fell 2.2% on year to 44,000 tn in june while consumption rose 17.7% to 102,000 tn, a rubber board source said. heavy rains in kerala, the top producer of rubber, have hit tapping in the last two months. with consumption shooting up and output declining, imports rose 14.7% on year to 38,367 tn in june, the source said. for apr-jun, natural rubber output was down 11.9% on year at 126,000 tn, while consumption was 14.2% higher at 302,000 tn. during the same period, imports rose 24.6% on year to 118,355 tn, the source said. india’s natural rubber inventory at the end of june was 232,000 tn, compared with 225,000 tn a year ago, the source said. the tyre industry has been demanding removal of port restrictions on natural rubber for all imports. 
in june, the government allowed imports of rubber meant for re-export under advance authorisation scheme, from any port in india. under the scheme, a product is imported duty-free with a re-export obligation after value addition. in 2016, the government restricted imports of natural rubber for re-export to only chennai and nhava sheva (jawaharlal nehru port) ports. the curbs are aimed at supporting domestic prices of rubber which have fallen by more than 100 rupees per kg in the last few years due to cheaper availability in global markets. tyre manufacturers are against the restrictions as it delays deliveries and increases transportation cost. endedited by akshit harsh cogencis tel +91 (11) 4220-1000 send comments to feedback@cogencis.com'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents[documents.index == 5027].values[0][0]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "<>:1: DeprecationWarning: invalid escape sequence \\.\n", + "<>:1: DeprecationWarning: invalid escape sequence \\.\n", + "<>:1: DeprecationWarning: invalid escape sequence \\.\n", + ":1: DeprecationWarning: invalid escape sequence \\.\n", + " documents['full_story'] = documents['Full.story'].map(lambda x: re.sub('[,\\.!?]', '', x))\n" + ] + } + ], + "source": [ + "documents['full_story'] = documents['Full.story'].str.replace(r'[,.!?]', '', regex=True)  # drop sentence punctuation\n", + "documents['full_story'] = documents['full_story'].str.replace(\"[^a-zA-Z#]\", \" \", regex=True)  # blank out digits and symbols\n", + "documents['full_story'] = documents['full_story'].str.replace(r'\\s+', ' ', regex=True)  # squeeze whitespace runs" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'cogencis
monday jul source india jun natural rubber use tn up on year india jun natural rubber imports tn up on year india natural rubber june end stocks tn by mugunthan kesavan new delhi india s natural rubber production fell on year to tn in june while consumption rose to tn a rubber board source said heavy rains in kerala the top producer of rubber have hit tapping in the last two months with consumption shooting up and output declining imports rose on year to tn in june the source said for apr jun natural rubber output was down on year at tn while consumption was higher at tn during the same period imports rose on year to tn the source said india s natural rubber inventory at the end of june was tn compared with tn a year ago the source said the tyre industry has been demanding removal of port restrictions on natural rubber for all imports in june the government allowed imports of rubber meant for re export under advance authorisation scheme from any port in india under the scheme a product is imported duty free with a re export obligation after value addition in the government restricted imports of natural rubber for re export to only chennai and nhava sheva jawaharlal nehru port ports the curbs are aimed at supporting domestic prices of rubber which have fallen by more than rupees per kg in the last few years due to cheaper availability in global markets tyre manufacturers are against the restrictions as it delays deliveries and increases transportation cost endedited by akshit harsh cogencis tel send comments to feedback cogenciscom'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents[documents.index == 5027].values[0][1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## EDA" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "from wordcloud import WordCloud" + ] + }, + { + "cell_type": "code", + 
"execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# Join all cleaned stories into one string for the word cloud.\n", + "long_string = ','.join(list(documents['full_story'].values))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wordcloud = WordCloud(background_color=\"white\", max_words=5000, contour_width=8, contour_color='steelblue')\n", + "wordcloud.generate(long_string)\n", + "wordcloud.to_image()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "def freq_words(x, terms = 30):\n", + " all_words = ' '.join([text for text in x])\n", + " all_words = all_words.split()\n", + "\n", + " fdist = FreqDist(all_words)\n", + " words_df = pd.DataFrame({'word':list(fdist.keys()), 'count':list(fdist.values())})\n", + "\n", + " d = words_df.nlargest(columns=\"count\", n = terms) \n", + " plt.figure(figsize=(20,5))\n", + " ax = sns.barplot(data=d, x= \"word\", y = \"count\")\n", + " ax.set(ylabel = 'Count')\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "freq_words(documents['full_story'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data processing" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "def sent_to_words(sentences):\n", + " for sentence in sentences:\n", + " yield(gensim.utils.simple_preprocess(str(sentence), deacc=True)) # deacc=True strips accent marks; the tokenizer itself drops punctuation\n", + "\n", + "data = documents.full_story.values.tolist()\n", + "data_words = list(sent_to_words(data))" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['sumitomo', 'rubber', 'industries', 'has', 'established', 'natural', 'rubber', 'procurement', 'subsidiary', 'in', 'singapore', 'the', 'world', 'largest', 'natural', 'rubber', 'trading', 'hub']\n" + ] + } + ], + "source": [ + "print(data_words[:1][0][:30])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Creating Bi-Gram and Tri-Gram" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.\n", + "trigram = gensim.models.Phrases(bigram[data_words], threshold=100) \n", + "\n", + "bigram_mod = gensim.models.phrases.Phraser(bigram)\n", + "trigram_mod = gensim.models.phrases.Phraser(trigram)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "my_stop_words = STOPWORDS.union(set(['send', 'comment', 'feedback', 'today', 'come', 'harsh', 'akshit', 'nishant', 'chakraborty', \n", + " 'augustine', 'cogencis', 'com', 'rahul', 'dhuri', 'end-users','rupeesedited', 'mugunthan', 'kesavan', \n", + " 'says', 'said', 'say', 'prabhnoor',
'nanda',\"show\",\"table\",\"detail\",\"give\",\"geojit\",\n", + " 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday', 'january', 'february',\n", + " 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december' ]))\n", + "\n", + "\n", + "# ,'kerla','delhi','kottayam','india','china','japanese','thailand','malaysia','tokyo',\n", + "# \"come\",\"chinese\",\"prabhnoor\",\"shikha\",\"singh\",\"mumbai\",\"rubber\",\"kerala\",\"kochi\",\"make\",\"board\",\"data\",\n", + "# \"grade\",\"cents\",\"iran\"])) " + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "stop_words = stopwords.words('english')\n", + "stop_words.extend(my_stop_words)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "def remove_stopwords(texts):\n", + " return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]\n", + "\n", + "def make_bigrams(texts):\n", + " return [bigram_mod[doc] for doc in texts]\n", + "\n", + "def make_trigrams(texts):\n", + " return [trigram_mod[bigram_mod[doc]] for doc in texts]\n", + "\n", + "def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):\n", + " texts_out = []\n", + " for sent in texts:\n", + " doc = nlp(\" \".join(sent)) \n", + " texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])\n", + " return texts_out" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "data_words_nostops = remove_stopwords(data_words)\n", + "\n", + "data_words_bigrams = make_bigrams(data_words_nostops)\n", + "\n", + "# Initialize spacy 'en' model, keeping only tagger component (for efficiency)\n", + "nlp = spacy.load(\"en_core_web_sm\", disable=['parser', 'ner'])\n", + "\n", + "# Do lemmatization keeping only noun, adj, vb, adv\n", + "data_lemmatized = 
lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "original document: \n", + "['sumitomo', 'rubber', 'industries', 'has', 'established', 'natural', 'rubber', 'procurement', 'subsidiary', 'in', 'singapore', 'the', 'world', 'largest', 'natural', 'rubber', 'trading', 'hub']\n", + "After lemmatization, tokenization:\n", + "['industry', 'establish', 'natural', 'rubber', 'subsidiary', 'large', 'natural', 'rubber', 'trading', 'hub']\n" + ] + } + ], + "source": [ + "print('original document: ')\n", + "print(data_words[:1][0][:30])\n", + "print('After lemmatization, tokenization:')\n", + "print(data_lemmatized[:1][0][:30])" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "words_after_preprocessing= list(itertools.chain.from_iterable(data_lemmatized))" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "freq_words(words_after_preprocessing)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Gensim doc2bow-- Creating dictionary from the data" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "id2word = corpora.Dictionary(data_lemmatized)\n", + "\n", + "# Create Corpus\n", + "texts = data_lemmatized\n", + "\n", + "# Term Document Frequency\n", + "corpus = [id2word.doc2bow(text) for text in texts]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "After lemmatization, tokenization:\n", + "['industry', 'establish', 'natural', 'rubber', 'subsidiary', 'large', 'natural', 'rubber', 'trading', 'hub']\n", + "Word 0 (\"establish\") appears 1 time.\n", + "Word 1 (\"hub\") appears 1 time.\n", + "Word 2 (\"industry\") appears 1 time.\n", + "Word 3 (\"large\") appears 1 time.\n", + "Word 4 (\"natural\") appears 2 time.\n", + "Word 5 (\"rubber\") appears 2 time.\n", + "Word 6 (\"subsidiary\") appears 1 time.\n", + "Word 7 (\"trading\") appears 1 time.\n" + ] + } + ], + "source": [ + "print('After lemmatization, tokenization:')\n", + "print(data_lemmatized[:1][0][:30])\n", + "bow_doc_test_0 = corpus[0]\n", + "for i in range(len(bow_doc_test_0)):\n", + " print(\"Word {} (\\\"{}\\\") appears {} time.\".format(bow_doc_test_0[i][0], \n", + " id2word[bow_doc_test_0[i][0]], \n", + " bow_doc_test_0[i][1]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Saving pickle files" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "# pickle.dump(corpus, open('corpus_bow_ngram.pkl', 'wb'))\n", + "# id2word.save('dictionary_bow_ngram.gensim')" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "## LDA" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "lda_model = gensim.models.LdaMulticore(corpus=corpus,\n", + " id2word=id2word,\n", + " num_topics=10, \n", + " random_state=100,\n", + " chunksize=100,\n", + " passes=10,\n", + " per_word_topics=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0, '0.088*\"rubber\" + 0.088*\"price\" + 0.073*\"oil\" + 0.071*\"crude\" + 0.032*\"cue\"')\n", + "(1, '0.060*\"rubber\" + 0.032*\"price\" + 0.020*\"supply\" + 0.017*\"production\" + 0.016*\"low\"')\n", + "(2, '0.022*\"commodity\" + 0.021*\"receive\" + 0.019*\"company\" + 0.016*\"rubber\" + 0.014*\"government\"')\n", + "(3, '0.068*\"rubber\" + 0.064*\"year\" + 0.037*\"natural\" + 0.033*\"month\" + 0.020*\"ton\"')\n", + "(4, '0.115*\"rubber\" + 0.088*\"contract\" + 0.054*\"rupee\" + 0.038*\"yen\" + 0.025*\"end\"')\n", + "(5, '0.083*\"settle\" + 0.025*\"denominate\" + 0.023*\"comment\" + 0.021*\"market\" + 0.020*\"subdue\"')\n", + "(6, '0.032*\"plantation\" + 0.027*\"state\" + 0.021*\"government\" + 0.019*\"price\" + 0.016*\"sector\"')\n", + "(7, '0.038*\"future\" + 0.031*\"contract\" + 0.023*\"trade\" + 0.023*\"yen\" + 0.023*\"position\"')\n", + "(8, '0.029*\"market\" + 0.016*\"growth\" + 0.015*\"term\" + 0.015*\"tax\" + 0.015*\"rate\"')\n", + "(9, '0.093*\"price\" + 0.057*\"rubber\" + 0.053*\"market\" + 0.053*\"rupee\" + 0.046*\"trader\"')\n" + ] + } + ], + "source": [ + "topics = lda_model.print_topics(num_words=5)\n", + "for topic in topics:\n", + " print(topic)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(0,\n", + " '0.088*\"rubber\" + 0.088*\"price\" + 0.073*\"oil\" + 0.071*\"crude\" + 0.032*\"cue\" '\n", + " '+ 
0.029*\"exchange\" + 0.019*\"substitute\" + 0.019*\"rise\" + 0.016*\"give\" + '\n", + " '0.016*\"future\"'),\n", + " (1,\n", + " '0.060*\"rubber\" + 0.032*\"price\" + 0.020*\"supply\" + 0.017*\"production\" + '\n", + " '0.016*\"low\" + 0.016*\"natural\" + 0.016*\"coming_day\" + 0.014*\"expect\" + '\n", + " '0.012*\"high\" + 0.012*\"year\"'),\n", + " (2,\n", + " '0.022*\"commodity\" + 0.021*\"receive\" + 0.019*\"company\" + 0.016*\"rubber\" + '\n", + " '0.014*\"government\" + 0.013*\"duty\" + 0.012*\"industry\" + 0.012*\"allow\" + '\n", + " '0.012*\"product\" + 0.011*\"capacity\"'),\n", + " (3,\n", + " '0.068*\"rubber\" + 0.064*\"year\" + 0.037*\"natural\" + 0.033*\"month\" + '\n", + " '0.020*\"ton\" + 0.019*\"production\" + 0.019*\"increase\" + 0.017*\"output\" + '\n", + " '0.017*\"consumption\" + 0.016*\"country\"'),\n", + " (4,\n", + " '0.115*\"rubber\" + 0.088*\"contract\" + 0.054*\"rupee\" + 0.038*\"yen\" + '\n", + " '0.025*\"end\" + 0.023*\"analyst\" + 0.022*\"fall\" + 0.021*\"active\" + '\n", + " '0.020*\"close\" + 0.020*\"high\"'),\n", + " (5,\n", + " '0.083*\"settle\" + 0.025*\"denominate\" + 0.023*\"comment\" + 0.021*\"market\" + '\n", + " '0.020*\"subdue\" + 0.017*\"make\" + 0.014*\"cheap\" + 0.014*\"fuel\" + '\n", + " '0.012*\"commodity\" + 0.011*\"expensive\"'),\n", + " (6,\n", + " '0.032*\"plantation\" + 0.027*\"state\" + 0.021*\"government\" + 0.019*\"price\" + '\n", + " '0.016*\"sector\" + 0.014*\"export\" + 0.013*\"report\" + 0.013*\"cardamom\" + '\n", + " '0.012*\"tell\" + 0.012*\"source\"'),\n", + " (7,\n", + " '0.038*\"future\" + 0.031*\"contract\" + 0.023*\"trade\" + 0.023*\"yen\" + '\n", + " '0.023*\"position\" + 0.022*\"delivery\" + 0.022*\"day\" + 0.021*\"rubber\" + '\n", + " '0.017*\"trading\" + 0.012*\"exchange\"'),\n", + " (8,\n", + " '0.029*\"market\" + 0.016*\"growth\" + 0.015*\"term\" + 0.015*\"tax\" + 0.015*\"rate\" '\n", + " '+ 0.014*\"increase\" + 0.014*\"holder\" + 0.012*\"continue\" + 0.011*\"tire\" + '\n", + " 
'0.010*\"demand\"'),\n", + " (9,\n", + " '0.093*\"price\" + 0.057*\"rubber\" + 0.053*\"market\" + 0.053*\"rupee\" + '\n", + " '0.046*\"trader\" + 0.040*\"rise\" + 0.031*\"demand\" + 0.030*\"fall\" + '\n", + " '0.029*\"domestic\" + 0.027*\"spot\"')]\n" + ] + } + ], + "source": [ + "pprint(lda_model.print_topics())\n", + "doc_lda = lda_model[corpus]" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Coherence Score: 0.5396317471542709\n" + ] + } + ], + "source": [ + "coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')\n", + "coherence_lda = coherence_model_lda.get_coherence()\n", + "print('Coherence Score: ', coherence_lda)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Optimal Topics" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=1):\n", + " coherence_values = []\n", + " model_list = []\n", + " for num_topics in range(start, limit, step):\n", + " model=LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)\n", + " model_list.append(model)\n", + " coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')\n", + " coherence_values.append(coherencemodel.get_coherence())\n", + "\n", + " return model_list, coherence_values" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "start, limit, step = 2, 20, 1  # define the sweep once; reused for the x axis below\n", + "model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized,\n", + "    start=start, limit=limit, step=step)\n", + "x = range(start, limit, step)\n", + "plt.plot(x, coherence_values)\n", + "plt.xlabel(\"Num Topics\")\n", + "plt.ylabel(\"Coherence score\")\n", + "plt.xticks(x)\n", + "plt.legend([\"coherence_values\"], loc='best')  # a list, so the full label is shown; a bare string is treated as a sequence of characters\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Topics- 3: Best model run" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "lda_model_best_3 = gensim.models.ldamodel.LdaModel(corpus=corpus,\n", + " id2word=id2word,\n", + " num_topics=3, \n", + " random_state=100,\n", + " chunksize=100,\n", + " passes=10,\n", + " alpha='auto',\n", + " eta= 0.61)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "# lda_model_best_3.save('model_lda_bow_ngram_best_3.gensim')" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(0, '0.021*\"month\" + 0.018*\"year\" + 0.012*\"index\" + 0.009*\"company\" + 0.009*\"inflation\" + 0.009*\"increase\" + 0.008*\"high\" + 0.007*\"rubber\" + 0.007*\"export\" + 0.006*\"state\"'), (1, '0.110*\"rubber\" + 0.066*\"price\" + 0.034*\"market\" + 0.032*\"natural\" + 0.031*\"rise\" + 0.028*\"fall\" + 0.023*\"oil\" + 0.023*\"domestic\" + 0.022*\"crude\" + 0.021*\"trader\"'), (2, '0.090*\"rupee\" + 0.068*\"rubber\" + 0.055*\"contract\" + 0.047*\"price\" + 0.039*\"yen\" + 0.034*\"close\" + 0.025*\"grade\" + 0.024*\"previous\" + 0.019*\"active\" + 0.017*\"rss\"')]\n" + ] + } + ], + "source": [ + "print(lda_model_best_3.print_topics())\n", +
"# doc_lda_best = lda_model_best[bow_corpus]" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0, '0.021*\"month\" + 0.018*\"year\" + 0.012*\"index\" + 0.009*\"company\" + 0.009*\"inflation\"')\n", + "(1, '0.110*\"rubber\" + 0.066*\"price\" + 0.034*\"market\" + 0.032*\"natural\" + 0.031*\"rise\"')\n", + "(2, '0.090*\"rupee\" + 0.068*\"rubber\" + 0.055*\"contract\" + 0.047*\"price\" + 0.039*\"yen\"')\n" + ] + } + ], + "source": [ + "topics_best_3 = lda_model_best_3.print_topics(num_words=5)\n", + "for topic in topics_best_3:\n", + " print(topic)" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0, '0.021*\"month\" + 0.018*\"year\" + 0.012*\"index\" + 0.009*\"company\" + 0.009*\"inflation\" + 0.009*\"increase\" + 0.008*\"high\" + 0.007*\"rubber\" + 0.007*\"export\" + 0.006*\"state\" + 0.006*\"low\" + 0.006*\"plantation\" + 0.006*\"production\"')\n", + "(1, '0.110*\"rubber\" + 0.066*\"price\" + 0.034*\"market\" + 0.032*\"natural\" + 0.031*\"rise\" + 0.028*\"fall\" + 0.023*\"oil\" + 0.023*\"domestic\" + 0.022*\"crude\" + 0.021*\"trader\" + 0.020*\"contract\" + 0.019*\"demand\" + 0.019*\"likely\"')\n", + "(2, '0.090*\"rupee\" + 0.068*\"rubber\" + 0.055*\"contract\" + 0.047*\"price\" + 0.039*\"yen\" + 0.034*\"close\" + 0.025*\"grade\" + 0.024*\"previous\" + 0.019*\"active\" + 0.017*\"rss\" + 0.016*\"high\" + 0.016*\"low\" + 0.016*\"datum\"')\n" + ] + } + ], + "source": [ + "topics_best_3 = lda_model_best_3.print_topics(num_words=13)\n", + "for topic in topics_best_3:\n", + " print(topic)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Coherence Score for c_v: 0.6172659695956492\n" + ] + } + ], + "source": [ + "coherence_model_lda_c_v_3 = 
CoherenceModel(model=lda_model_best_3, texts=data_lemmatized, dictionary=id2word, coherence='c_v')\n", + "coherence_lda_3 = coherence_model_lda_c_v_3.get_coherence()\n", + "print('Coherence Score for c_v: ', coherence_lda_3)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "def get_lda_topics(model, num_topics):\n", + " word_dict = {};\n", + " for i in range(num_topics):\n", + " words = model.show_topic(i, topn = 40);\n", + " word_dict['Topic ' + '{:d}'.format(i+1)] = [i[0] for i in words];\n", + " return pd.DataFrame(word_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Topic 1Topic 2Topic 3
0monthrubberrupee
1yearpricerubber
2indexmarketcontract
3companynaturalprice
4inflationriseyen
5increasefallclose
6highoilgrade
7rubberdomesticprevious
8exportcrudeactive
9statetraderrss
10lowcontracthigh
11plantationdemandlow
12productionlikelydatum
13wholesalespottrader
14commoditysupplymarket
15reportkeyexchange
16countryanalystend
17quartergainbourse
18releasebasesettle
19productendboard
20expectglobalhit
21floodexchangeaccord
22ratecuefuture
23accordimportwidely_trade
24rainfallweekweek
25mlnexpectationvariety
26periodtocomcompare
27seefuturechange
28largeoutputgive
29governmentsupportfollowing
30salesubstitutefall
31growthexpectshow
32consumptionproductionclosing
33sectorhighrise
34industrybenchmarkjapanese
35raiseconcerndetail
36damagenewanalyst
37arealowdollar
38totalstocktocom
39raindeclinesell
\n", + "
" + ], + "text/plain": [ + " Topic 1 Topic 2 Topic 3\n", + "0 month rubber rupee\n", + "1 year price rubber\n", + "2 index market contract\n", + "3 company natural price\n", + "4 inflation rise yen\n", + "5 increase fall close\n", + "6 high oil grade\n", + "7 rubber domestic previous\n", + "8 export crude active\n", + "9 state trader rss\n", + "10 low contract high\n", + "11 plantation demand low\n", + "12 production likely datum\n", + "13 wholesale spot trader\n", + "14 commodity supply market\n", + "15 report key exchange\n", + "16 country analyst end\n", + "17 quarter gain bourse\n", + "18 release base settle\n", + "19 product end board\n", + "20 expect global hit\n", + "21 flood exchange accord\n", + "22 rate cue future\n", + "23 accord import widely_trade\n", + "24 rainfall week week\n", + "25 mln expectation variety\n", + "26 period tocom compare\n", + "27 see future change\n", + "28 large output give\n", + "29 government support following\n", + "30 sale substitute fall\n", + "31 growth expect show\n", + "32 consumption production closing\n", + "33 sector high rise\n", + "34 industry benchmark japanese\n", + "35 raise concern detail\n", + "36 damage new analyst\n", + "37 area low dollar\n", + "38 total stock tocom\n", + "39 rain decline sell" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_lda_topics(lda_model_best_3, 3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualize topic keywords from best model" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "dictionary = gensim.corpora.Dictionary.load('dictionary_bow_ngram.gensim')\n", + "corpus = pickle.load(open('corpus_bow_ngram.pkl', 'rb'))\n", + "# lda = gensim.models.ldamodel.LdaModel.load('model_lda_bow_ngram_best_3.gensim')" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { 
+ "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\plohith\\.conda\\envs\\rpg_env\\lib\\site-packages\\pyLDAvis\\_prepare.py:257: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + " return pd.concat([default_term_info] + list(topic_dfs))\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lda_display = pyLDAvis.gensim.prepare(lda_model_best_3, corpus, dictionary, sort_topics=False)\n", + "pyLDAvis.display(lda_display)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Topic-3: Train Data Mapping" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [], + "source": [ + "documents_copy= documents.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [], + "source": [ + "df_lemma = pd.DataFrame({'lemmatized':data_lemmatized})" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [], + "source": [ + "documents_copy_df_lemma= pd.concat([documents_copy,df_lemma],axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Full.storyfull_storylemmatized
0sumitomo rubber industries has established a n...sumitomo rubber industries has established a n...[industry, establish, natural, rubber, subsidi...
1spot rubber closed unchanged on thursday. rss ...spot rubber closed unchanged on thursday rss ...[spot, rubber, close, unchanged, rss, quote, s...
2delegate registration for india rubber meet 20...delegate registration for india rubber meet i...[meet, hold]
3mumbai – futures contracts of rubber on the in...mumbai futures contracts of rubber on the ind...[rise, ongoing, supply, crunch, market, trader...
4tapping has been delayed despite the fact that...tapping has been delayed despite the fact that...[tap, delay, fact, peak_season, decline, impor...
............
5270cogencis, wednesday, apr 11 new delhi – india’...cogencis wednesday apr new delhi india s nat...[rubber, production, fall, year, accord, relea...
5271cogencis, tuesday, apr 10 by prabhnoor nanda n...cogencis tuesday apr by prabhnoor nanda new d...[contract, rubber, exchange, likely, fall, ses...
5272outlook futures contracts of natural rubber ma...outlook futures contracts of natural rubber ma...[contract, natural, rubber, trade, low, sessio...
5273cogencis, tuesday, apr 10 by shikha singh new ...cogencis tuesday apr by shikha singh new delh...[price, natural, rubber, fall, spot, market, p...
5274cogencis, monday, apr 9 by prabhnoor nanda new...cogencis monday apr by prabhnoor nanda new de...[contract, rubber, exchange, settle, marginall...
\n", + "

5275 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " Full.story \\\n", + "0 sumitomo rubber industries has established a n... \n", + "1 spot rubber closed unchanged on thursday. rss ... \n", + "2 delegate registration for india rubber meet 20... \n", + "3 mumbai – futures contracts of rubber on the in... \n", + "4 tapping has been delayed despite the fact that... \n", + "... ... \n", + "5270 cogencis, wednesday, apr 11 new delhi – india’... \n", + "5271 cogencis, tuesday, apr 10 by prabhnoor nanda n... \n", + "5272 outlook futures contracts of natural rubber ma... \n", + "5273 cogencis, tuesday, apr 10 by shikha singh new ... \n", + "5274 cogencis, monday, apr 9 by prabhnoor nanda new... \n", + "\n", + " full_story \\\n", + "0 sumitomo rubber industries has established a n... \n", + "1 spot rubber closed unchanged on thursday rss ... \n", + "2 delegate registration for india rubber meet i... \n", + "3 mumbai futures contracts of rubber on the ind... \n", + "4 tapping has been delayed despite the fact that... \n", + "... ... \n", + "5270 cogencis wednesday apr new delhi india s nat... \n", + "5271 cogencis tuesday apr by prabhnoor nanda new d... \n", + "5272 outlook futures contracts of natural rubber ma... \n", + "5273 cogencis tuesday apr by shikha singh new delh... \n", + "5274 cogencis monday apr by prabhnoor nanda new de... \n", + "\n", + " lemmatized \n", + "0 [industry, establish, natural, rubber, subsidi... \n", + "1 [spot, rubber, close, unchanged, rss, quote, s... \n", + "2 [meet, hold] \n", + "3 [rise, ongoing, supply, crunch, market, trader... \n", + "4 [tap, delay, fact, peak_season, decline, impor... \n", + "... ... \n", + "5270 [rubber, production, fall, year, accord, relea... \n", + "5271 [contract, rubber, exchange, likely, fall, ses... \n", + "5272 [contract, natural, rubber, trade, low, sessio... \n", + "5273 [price, natural, rubber, fall, spot, market, p... \n", + "5274 [contract, rubber, exchange, settle, marginall... 
\n", + "\n", + "[5275 rows x 3 columns]" + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents_copy_df_lemma" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [], + "source": [ + "def identify_topic_sentence_120(row):\n", + " bow_vector = id2word.doc2bow(row['lemmatized'])\n", + " row2= sorted(bow_vector, key=lambda x: (x[1]), reverse=True)\n", + " main_keywords = \",\".join([id2word[wor[0]] for index, wor in enumerate(row2)])\n", + " row['Main_keywords']= main_keywords\n", + " \n", + " row1= lda_model_best_3[bow_vector]\n", + " row1 = sorted(row1, key=lambda x: (x[1]), reverse=True)\n", + "\n", + " if len(row1)== 1:\n", + " first_topic_num= int(row1[0][0])\n", + " first_topic_num+= 1\n", + " first_prop_topic= row1[0][1]\n", + " second_topic_num= 0\n", + " second_prop_topic= 0\n", + " else:\n", + " first_topic_num= int(row1[0][0])\n", + " first_topic_num+= 1\n", + " first_prop_topic= row1[0][1]\n", + " second_topic_num= int(row1[1][0])\n", + " second_topic_num+= 1\n", + " second_prop_topic= row1[1][1]\n", + "\n", + " row['First_topic_number']= int(first_topic_num) \n", + " if int(first_topic_num)== 1:\n", + " row['First_topic_name']= 'Global economy'\n", + " elif int(first_topic_num)== 2:\n", + " row['First_topic_name']= 'Rate change'\n", + " else:\n", + " row['First_topic_name']= 'Rubber trade' \n", + " row['First_topic_propability']= round(first_prop_topic,3)\n", + "\n", + "\n", + " row['Second_topic_number']= int(second_topic_num)\n", + " if int(second_topic_num)== 1:\n", + " row['Second_topic_name']= 'Global economy'\n", + " elif int(second_topic_num)== 2:\n", + " row['Second_topic_name']= 'Rate change'\n", + " elif int(second_topic_num)== 3:\n", + " row['Second_topic_name']= 'Rubber trade' \n", + " else:\n", + " row['Second_topic_name']= 'Unidentified'\n", + " row['Second_topic_propability']= round(second_prop_topic,3)\n", + " \n", + " \n", + " return 
row " + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [], + "source": [ + "doc120= documents_copy_df_lemma.apply(identify_topic_sentence_120,axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Full.storyfull_storylemmatizedMain_keywordsFirst_topic_numberFirst_topic_nameFirst_topic_propabilitySecond_topic_numberSecond_topic_nameSecond_topic_propability
0sumitomo rubber industries has established a n...sumitomo rubber industries has established a n...[industry, establish, natural, rubber, subsidi...natural,rubber,establish,hub,industry,large,su...2Rate change0.5211Global economy0.466
1spot rubber closed unchanged on thursday. rss ...spot rubber closed unchanged on thursday rss ...[spot, rubber, close, unchanged, rss, quote, s...rubber,accord,close,dealer,finish,flat,quote,r...3Rubber trade0.9782Rate change0.016
2delegate registration for india rubber meet 20...delegate registration for india rubber meet i...[meet, hold]hold,meet1Global economy0.8572Rate change0.085
3mumbai – futures contracts of rubber on the in...mumbai futures contracts of rubber on the ind...[rise, ongoing, supply, crunch, market, trader...close,trader,active,contract,crunch,end,market...3Rubber trade0.5292Rate change0.464
4tapping has been delayed despite the fact that...tapping has been delayed despite the fact that...[tap, delay, fact, peak_season, decline, impor...rubber,trader,supply,base,decline,deepen,delay...2Rate change0.8341Global economy0.155
.................................
5270cogencis, wednesday, apr 11 new delhi – india’...cogencis wednesday apr new delhi india s nat...[rubber, production, fall, year, accord, relea...year,rubber,end,rise,natural,month,consumption...2Rate change0.6661Global economy0.330
5271cogencis, tuesday, apr 10 by prabhnoor nanda n...cogencis tuesday apr by prabhnoor nanda new d...[contract, rubber, exchange, likely, fall, ses...rubber,price,fall,natural,demand,likely,week,c...2Rate change0.7033Rubber trade0.168
5272outlook futures contracts of natural rubber ma...outlook futures contracts of natural rubber ma...[contract, natural, rubber, trade, low, sessio...rubber,price,natural,crude,oil,track,contract,...2Rate change0.7943Rubber trade0.204
5273cogencis, tuesday, apr 10 by shikha singh new ...cogencis tuesday apr by shikha singh new delh...[price, natural, rubber, fall, spot, market, p...rubber,close,price,natural,market,previous,rup...2Rate change0.4853Rubber trade0.447
5274cogencis, monday, apr 9 by prabhnoor nanda new...cogencis monday apr by prabhnoor nanda new de...[contract, rubber, exchange, settle, marginall...rubber,price,close,contract,exchange,market,ri...3Rubber trade0.5312Rate change0.469
\n", + "

5275 rows × 10 columns

\n", + "
" + ], + "text/plain": [ + " Full.story \\\n", + "0 sumitomo rubber industries has established a n... \n", + "1 spot rubber closed unchanged on thursday. rss ... \n", + "2 delegate registration for india rubber meet 20... \n", + "3 mumbai – futures contracts of rubber on the in... \n", + "4 tapping has been delayed despite the fact that... \n", + "... ... \n", + "5270 cogencis, wednesday, apr 11 new delhi – india’... \n", + "5271 cogencis, tuesday, apr 10 by prabhnoor nanda n... \n", + "5272 outlook futures contracts of natural rubber ma... \n", + "5273 cogencis, tuesday, apr 10 by shikha singh new ... \n", + "5274 cogencis, monday, apr 9 by prabhnoor nanda new... \n", + "\n", + " full_story \\\n", + "0 sumitomo rubber industries has established a n... \n", + "1 spot rubber closed unchanged on thursday rss ... \n", + "2 delegate registration for india rubber meet i... \n", + "3 mumbai futures contracts of rubber on the ind... \n", + "4 tapping has been delayed despite the fact that... \n", + "... ... \n", + "5270 cogencis wednesday apr new delhi india s nat... \n", + "5271 cogencis tuesday apr by prabhnoor nanda new d... \n", + "5272 outlook futures contracts of natural rubber ma... \n", + "5273 cogencis tuesday apr by shikha singh new delh... \n", + "5274 cogencis monday apr by prabhnoor nanda new de... \n", + "\n", + " lemmatized \\\n", + "0 [industry, establish, natural, rubber, subsidi... \n", + "1 [spot, rubber, close, unchanged, rss, quote, s... \n", + "2 [meet, hold] \n", + "3 [rise, ongoing, supply, crunch, market, trader... \n", + "4 [tap, delay, fact, peak_season, decline, impor... \n", + "... ... \n", + "5270 [rubber, production, fall, year, accord, relea... \n", + "5271 [contract, rubber, exchange, likely, fall, ses... \n", + "5272 [contract, natural, rubber, trade, low, sessio... \n", + "5273 [price, natural, rubber, fall, spot, market, p... \n", + "5274 [contract, rubber, exchange, settle, marginall... 
\n", + "\n", + " Main_keywords First_topic_number \\\n", + "0 natural,rubber,establish,hub,industry,large,su... 2 \n", + "1 rubber,accord,close,dealer,finish,flat,quote,r... 3 \n", + "2 hold,meet 1 \n", + "3 close,trader,active,contract,crunch,end,market... 3 \n", + "4 rubber,trader,supply,base,decline,deepen,delay... 2 \n", + "... ... ... \n", + "5270 year,rubber,end,rise,natural,month,consumption... 2 \n", + "5271 rubber,price,fall,natural,demand,likely,week,c... 2 \n", + "5272 rubber,price,natural,crude,oil,track,contract,... 2 \n", + "5273 rubber,close,price,natural,market,previous,rup... 2 \n", + "5274 rubber,price,close,contract,exchange,market,ri... 3 \n", + "\n", + " First_topic_name First_topic_propability Second_topic_number \\\n", + "0 Rate change 0.521 1 \n", + "1 Rubber trade 0.978 2 \n", + "2 Global economy 0.857 2 \n", + "3 Rubber trade 0.529 2 \n", + "4 Rate change 0.834 1 \n", + "... ... ... ... \n", + "5270 Rate change 0.666 1 \n", + "5271 Rate change 0.703 3 \n", + "5272 Rate change 0.794 3 \n", + "5273 Rate change 0.485 3 \n", + "5274 Rubber trade 0.531 2 \n", + "\n", + " Second_topic_name Second_topic_propability \n", + "0 Global economy 0.466 \n", + "1 Rate change 0.016 \n", + "2 Rate change 0.085 \n", + "3 Rate change 0.464 \n", + "4 Global economy 0.155 \n", + "... ... ... 
\n", + "5270 Global economy 0.330 \n", + "5271 Rubber trade 0.168 \n", + "5272 Rubber trade 0.204 \n", + "5273 Rubber trade 0.447 \n", + "5274 Rate change 0.469 \n", + "\n", + "[5275 rows x 10 columns]" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "doc120" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [], + "source": [ + "doc120.drop(['lemmatized', 'full_story'],axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [], + "source": [ + "doc120= doc120[['First_topic_number', 'First_topic_propability', 'First_topic_name',\n", + " 'Second_topic_number','Second_topic_propability','Second_topic_name',\n", + " 'Main_keywords','Full.story']]" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [], + "source": [ + "doc120.to_excel('Topic_modelling_3.xlsx', float_format= \"%.2f\", index= False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tuning Hyperparameters of LDA" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_coherence_values(corpus, dictionary, k, a, b):\n", + " \n", + " lda_model_hyper = gensim.models.LdaMulticore(corpus=corpus,\n", + " id2word=dictionary,\n", + " num_topics=k, \n", + " random_state=100,\n", + " chunksize=100,\n", + " passes=10,\n", + " alpha=a,\n", + " eta=b)\n", + " \n", + " coherence_model_lda = 
CoherenceModel(model=lda_model_hyper, texts=data_lemmatized, dictionary=id2word, coherence='c_v')\n", + " \n", + " return coherence_model_lda.get_coherence()" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████████████████████████████████████████████████████████████████████████| 270/270 [1:38:06<00:00, 21.80s/it]\n" + ] + } + ], + "source": [ + "grid = {}\n", + "grid['Validation_Set'] = {}\n", + "\n", + "min_topics = 2\n", + "max_topics = 11\n", + "step_size = 1\n", + "topics_range = range(min_topics, max_topics, step_size)\n", + "\n", + "alpha = list(np.arange(0.01, 1, 0.3))\n", + "alpha.append('symmetric')\n", + "alpha.append('asymmetric')\n", + "\n", + "beta = list(np.arange(0.01, 1, 0.3))\n", + "beta.append('symmetric')\n", + "\n", + "# Validation sets\n", + "num_of_docs = len(corpus)\n", + "corpus_sets = [corpus]\n", + "\n", + "corpus_title = ['100% Corpus']\n", + "\n", + "model_results = {'Validation_Set': [],\n", + " 'Topics': [],\n", + " 'Alpha': [],\n", + " 'Beta': [],\n", + " 'Coherence': []\n", + " }\n", + "\n", + "if 1 == 1:\n", + " pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))\n", + " \n", + " for i in range(len(corpus_sets)):\n", + " for k in topics_range: #topic iterate\n", + " for a in alpha: # alpha iterate\n", + " for b in beta: #beta iterate\n", + " cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, k=k, a=a, b=b)\n", + " # Save the model results\n", + " model_results['Validation_Set'].append(corpus_title[i])\n", + " model_results['Topics'].append(k)\n", + " model_results['Alpha'].append(a)\n", + " model_results['Beta'].append(b)\n", + " model_results['Coherence'].append(cv)\n", + " \n", + " pbar.update(1)\n", + " pd.DataFrame(model_results).to_csv('lda_bow_ngram_tuning_results.csv', index=False)\n", + " pbar.close()" + ] + } + ], + "metadata": { + "kernelspec": { + 
"display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Web Scraping_Website.ipynb b/Web Scraping_Website.ipynb new file mode 100644 index 0000000..d3fc728 --- /dev/null +++ b/Web Scraping_Website.ipynb @@ -0,0 +1,1694 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: beautifulsoup4 in c:\\programdata\\anaconda3\\lib\\site-packages (4.8.0)\n", + "Requirement already satisfied: requests in c:\\programdata\\anaconda3\\lib\\site-packages (2.22.0)\n", + "Requirement already satisfied: pandas in c:\\programdata\\anaconda3\\lib\\site-packages (0.25.1)\n", + "Requirement already satisfied: soupsieve>=1.2 in c:\\programdata\\anaconda3\\lib\\site-packages (from beautifulsoup4) (1.9.3)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\programdata\\anaconda3\\lib\\site-packages (from requests) (1.24.2)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in c:\\programdata\\anaconda3\\lib\\site-packages (from requests) (2.8)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\\programdata\\anaconda3\\lib\\site-packages (from requests) (3.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\programdata\\anaconda3\\lib\\site-packages (from requests) (2019.9.11)\n", + "Requirement already satisfied: pytz>=2017.2 in c:\\programdata\\anaconda3\\lib\\site-packages (from pandas) (2019.3)\n", + "Requirement already satisfied: python-dateutil>=2.6.1 in c:\\programdata\\anaconda3\\lib\\site-packages (from pandas) (2.8.0)\n", + "Requirement 
already satisfied: numpy>=1.13.3 in c:\\programdata\\anaconda3\\lib\\site-packages (from pandas) (1.16.5)\n", + "Requirement already satisfied: six>=1.5 in c:\\programdata\\anaconda3\\lib\\site-packages (from python-dateutil>=2.6.1->pandas) (1.12.0)\n" + ] + } + ], + "source": [ + "!pip install beautifulsoup4 requests pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "from bs4 import BeautifulSoup\n", + "import requests" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "url = \"https://subaahsavere.com/?s=commodity\"" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "res = requests.get(url) # it opens up the connection and grabs the webpage\n", + "print(res)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " \n", + "\n", + "You searched for rubber - Global Rubber Markets\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "Search\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\r\n", + "\r\n", + " Wednesday, January 22, 2020\r\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
Home Search
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

\n", + "rubber - search results\n", + "

\n", + "
\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "
\r\n", + " If you're not happy with the results, please do another search
\n", + "
\n", + "
\n", + "
\"\"
\n", + "
\n", + "

Tokyo rubber slumps over 4 percent

\n", + "
\n", + "
\n", + "
\r\n", + " Tokyo Commodity Exchange (TOCOM) futures plunged more than 4% on Tuesday, their most in 22 months, as concerns over the potential economic fallout from...
\n", + "
\n", + "
\n", + "
\n", + "
\"\"
\n", + "
\n", + "

India Rubber: Tad dn on ICEX on TOCOM cues; rise in spot limits fall

\n", + "
\n", + "
\n", + "
\r\n", + " \r\n", + " Back \r\n", + "\r\n", + " ...
\n", + "
\n", + "
\n", + "
\n", + "
\"\"
\n", + "
\n", + "

India Rubber: Down on ICEX tailing TOCOM contracts; up in spot mkts

\n", + "
\n", + "
\n", + "
\r\n", + " \r\n", + " Back \r\n", + "\r\n", + " ...
\n", + "
\n", + "
\n", + "
\n", + "
\"\"
\n", + "
\n", + "

Tokyo rubber hits one-week low

\n", + "
\n", + "
\n", + "
\r\n", + " Tokyo Commodity Exchange (TOCOM) futures slipped to an about 1-week low on Monday, as investors took profits after the benchmark touched a 10-1/2-month high...
\n", + "
\n", + "
\n", + "
\n", + "
\"\"
\n", + "
\n", + "

India Rubber: Down on ICEX on short selling, tailing TOCOM contracts

\n", + "
\n", + "
\n", + "
\r\n", + " \r\n", + " Back \r\n", + "\r\n", + " ...
\n", + "
\n", + "
\n", + "
\n", + "
\"Natural
\n", + "
\n", + "

Natural Rubber prices surge at tail end of peak season

\n", + "
\n", + "
\n", + "
\r\n", + " From Rs 120 per kg in the first week of November, the price of premium grade RSS-4 has crossed Rs 137 per kg this...
\n", + "
\n", + "
\n", + "
\n", + "
\"Chinese
\n", + "
\n", + "

Chinese Natural Rubber Daily Market Price Report: January 16, 2019

\n", + "
\n", + "
\n", + "
\r\n", + " Chinese Natural Rubber Daily Market Price Report: January 16, 2019\r\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\"\"
\n", + "
\n", + "

Asian physical rubber prices: January 16, 2020

\n", + "
\n", + "
\n", + "
\r\n", + " Jan 16 (Reuters) - Asian physical rubber prices\r\n", + "\r\n", + "\r\n", + "\r\n", + "Grade\r\n", + "Prices\r\n", + "\r\n", + "\r\n", + "Thai RSS3 (February)\r\n", + "$1.66/kg\r\n", + "\r\n", + "\r\n", + "Thai STR20 (February)\r\n", + "$1.52/kg\r\n", + "\r\n", + "\r\n", + "Thai 60% latex (bulk/February)\r\n", + "$1,150/tonne\r\n", + "\r\n", + "\r\n", + "Thai 60% latex (drum/February)\r\n", + "$1,250/tonne\r\n", + "\r\n", + "\r\n", + "Malaysia SMR20 (February)\r\n", + "$1.52/kg\r\n", + "\r\n", + "\r\n", + "Indonesia SIR20*\r\n", + "$1.54/Kg\r\n", + "\r\n", + "\r\n", + "Thai USS3\r\n", + "41.94 baht/kg\r\n", + "\r\n", + "\r\n", + "\r\n", + "* As...
\n", + "
\n", + "
\n", + "
\n", + "
\"\"
\n", + "
\n", + "

India Rubber: Hits 4-month high on ICEX tracking gains in Kerala mkt

\n", + "
\n", + "
\n", + "
\r\n", + " By Rahul Dhuri\r\n", + "MUMBAI – The most active February contract of rubber hit a four-month high of 14,380 rupees per 100 kg on the Indian...
\n", + "
\n", + "
\n", + "
\n", + "
\"\"
\n", + "
\n", + "

India: Spot rubber shows a mixed mood

\n", + "
\n", + "
\n", + "
\r\n", + " \r\n", + "\r\n", + "\r\n", + "Spot rubber showed a mixed mood on Thursday. RSS 4 was quoted steady at Rs.137.50 per kg by traders. The same improved to Rs.137.50...
\n", + "
\n", + "
\n", + "
123...5,127Page 1 of 5,127
\n", + "
\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "
\n", + "\n", + "
\n", + "

Advertisement

\n", + "\n", + "
\n", + "\n", + "
\n", + "\n", + "

LATEST

\n", + "
\n", + "
\n", + "
\"\"
\n", + "
\n", + "

Oil market shrugs off Libya crisis amid ample global supply

\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\"\"
\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\"\"
\n", + "
\n", + "

Egypt may add Indian wheat to list of acceptable import origins

\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\"\"
\n", + "
\n", + "

Expect A Strong Year For Oil Discoveries

\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\"\"
\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\"\"
\n", + "
\n", + "

Spanish olive growers claim first victory against U.S. duties

\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\"\"
\n", + "
\n", + "

Trump says middle class tax cut to be announced over next...

\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(soup)" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "January 22, 2020\n", + "Tokyo rubber slumps over 4 percent\n", + "Tokyo Commodity Exchange (TOCOM) futures plunged more than 4% on Tuesday, their most in 22 months, as concerns over the potential economic fallout from... \n", + "https://globalrubbermarkets.com/194165/tokyo-rubber-slumps-over-4-percent.html\n", + "\n", + "\n", + "January 22, 2020\n", + "India Rubber: Tad dn on ICEX on TOCOM cues; rise in spot limits fall\n", + "Back \r\n", + "\r\n", + " ... \n", + "https://globalrubbermarkets.com/194109/india-rubber-tad-dn-on-icex-on-tocom-cues-rise-in-spot-limits-fall.html\n", + "\n", + "\n", + "January 21, 2020\n", + "India Rubber: Down on ICEX tailing TOCOM contracts; up in spot mkts\n", + "Back \r\n", + "\r\n", + " ... \n", + "https://globalrubbermarkets.com/193884/india-rubber-down-on-icex-tailing-tocom-contracts-up-in-spot-mkts.html\n", + "\n", + "\n", + "January 20, 2020\n", + "Tokyo rubber hits one-week low\n", + "Tokyo Commodity Exchange (TOCOM) futures slipped to an about 1-week low on Monday, as investors took profits after the benchmark touched a 10-1/2-month high... \n", + "https://globalrubbermarkets.com/193865/tokyo-rubber-hits-one-week-low.html\n", + "\n", + "\n", + "January 17, 2020\n", + "India Rubber: Down on ICEX on short selling, tailing TOCOM contracts\n", + "Back \r\n", + "\r\n", + " ... \n", + "https://globalrubbermarkets.com/193518/india-rubber-down-on-icex-on-short-selling-tailing-tocom-contracts.html\n", + "\n", + "\n", + "January 17, 2020\n", + "Natural Rubber prices surge at tail end of peak season\n", + "From Rs 120 per kg in the first week of November, the price of premium grade RSS-4 has crossed Rs 137 per kg this... 
\n", + "https://globalrubbermarkets.com/193348/natural-rubber-prices-surge-at-tail-end-of-peak-season.html\n", + "\n", + "\n", + "January 17, 2020\n", + "Chinese Natural Rubber Daily Market Price Report: January 16, 2019\n", + "Chinese Natural Rubber Daily Market Price Report: January 16, 2019\r\n", + " \n", + "https://globalrubbermarkets.com/193343/chinese-natural-rubber-daily-market-price-report-january-16-2019.html\n", + "\n", + "\n", + "January 17, 2020\n", + "Asian physical rubber prices: January 16, 2020\n", + "Jan 16 (Reuters) - Asian physical rubber prices\r\n", + "\r\n", + "\r\n", + "\r\n", + "Grade\r\n", + "Prices\r\n", + "\r\n", + "\r\n", + "Thai RSS3 (February)\r\n", + "$1.66/kg\r\n", + "\r\n", + "\r\n", + "Thai STR20 (February)\r\n", + "$1.52/kg\r\n", + "\r\n", + "\r\n", + "Thai 60% latex (bulk/February)\r\n", + "$1,150/tonne\r\n", + "\r\n", + "\r\n", + "Thai 60% latex (drum/February)\r\n", + "$1,250/tonne\r\n", + "\r\n", + "\r\n", + "Malaysia SMR20 (February)\r\n", + "$1.52/kg\r\n", + "\r\n", + "\r\n", + "Indonesia SIR20*\r\n", + "$1.54/Kg\r\n", + "\r\n", + "\r\n", + "Thai USS3\r\n", + "41.94 baht/kg\r\n", + "\r\n", + "\r\n", + "\r\n", + "* As... \n", + "https://globalrubbermarkets.com/193342/asian-physical-rubber-prices-january-16-2020.html\n", + "\n", + "\n", + "January 17, 2020\n", + "India Rubber: Hits 4-month high on ICEX tracking gains in Kerala mkt\n", + "By Rahul Dhuri\r\n", + "MUMBAI – The most active February contract of rubber hit a four-month high of 14,380 rupees per 100 kg on the Indian... \n", + "https://globalrubbermarkets.com/193340/india-rubber-hits-4-month-high-on-icex-tracking-gains-in-kerala-mkt-2.html\n", + "\n", + "\n", + "January 17, 2020\n", + "India: Spot rubber shows a mixed mood\n", + "Spot rubber showed a mixed mood on Thursday. RSS 4 was quoted steady at Rs.137.50 per kg by traders. The same improved to Rs.137.50... 
\n", + "https://globalrubbermarkets.com/193338/india-spot-rubber-shows-a-mixed-mood.html\n", + "\n", + "\n" + ] + } + ], + "source": [ + "container = soup.findAll(\"div\",{\"class\":\"td_module_16 td_module_wrap td-animation-stack\"})\n", + "all_links = list()\n", + "for i in container:\n", + " headlines = i.findAll(\"div\",{\"class\":\"item-details\"})\n", + " for j in headlines:\n", + " date = j.find(\"span\",{\"class\":\"td-post-date\"})\n", + " print(date.text)\n", + " title = j.find(\"h3\",{\"class\":\"entry-title td-module-title\"})\n", + " print(title.text)\n", + " summary = j.find(\"div\",{\"class\":\"td-excerpt\"})\n", + " print((summary.text).lstrip())\n", + " link = j.find(\"a\")\n", + " print(link['href'])\n", + " all_links.append(link['href'])\n", + " print(\"\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['https://globalrubbermarkets.com/194165/tokyo-rubber-slumps-over-4-percent.html',\n", + " 'https://globalrubbermarkets.com/194109/india-rubber-tad-dn-on-icex-on-tocom-cues-rise-in-spot-limits-fall.html',\n", + " 'https://globalrubbermarkets.com/193884/india-rubber-down-on-icex-tailing-tocom-contracts-up-in-spot-mkts.html',\n", + " 'https://globalrubbermarkets.com/193865/tokyo-rubber-hits-one-week-low.html',\n", + " 'https://globalrubbermarkets.com/193518/india-rubber-down-on-icex-on-short-selling-tailing-tocom-contracts.html',\n", + " 'https://globalrubbermarkets.com/193348/natural-rubber-prices-surge-at-tail-end-of-peak-season.html',\n", + " 'https://globalrubbermarkets.com/193343/chinese-natural-rubber-daily-market-price-report-january-16-2019.html',\n", + " 'https://globalrubbermarkets.com/193342/asian-physical-rubber-prices-january-16-2020.html',\n", + " 'https://globalrubbermarkets.com/193340/india-rubber-hits-4-month-high-on-icex-tracking-gains-in-kerala-mkt-2.html',\n", + " 
'https://globalrubbermarkets.com/193338/india-spot-rubber-shows-a-mixed-mood.html']" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_links" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tokyo Commodity Exchange (TOCOM) futures plunged more than 4% on Tuesday, their most in 22 months, as concerns over the potential economic fallout from an outbreak of a new flu-like virus in China weighed on prices.\n", + "TOCOM’s rubber contract for June delivery finished 8.4 yen, or 4.2%, lower at 193.0 yen ($1.76) per kg, marking the biggest one-day drop since March 2018. It touched the lowest since Dec. 23 of 192.2 yen earlier in the session.\n", + "The most-active rubber contract on the Shanghai futures exchange for May delivery tumbled 345 yuan to finish at 12,620 yuan ($1,827) per tonne.\n", + "The front-month rubber contract on Singapore’s SICOM exchange for February delivery last traded at 147.2 US cents per kg, down 2.3%.\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Full.story
0Sumitomo Rubber Industries has established a n...
1Spot rubber closed unchanged on Thursday. RSS ...
2Delegate registration for India Rubber Meet 20...
3MUMBAI – Futures contracts of rubber on the In...
4Tapping has been delayed despite the fact that...
......
5270Cogencis, Wednesday, Apr 11 NEW DELHI – India’...
5271Cogencis, Tuesday, Apr 10 By Prabhnoor Nanda N...
5272OUTLOOK Futures contracts of natural rubber ma...
5273Cogencis, Tuesday, Apr 10 By Shikha Singh NEW ...
5274Cogencis, Monday, Apr 9 By Prabhnoor Nanda NEW...
\n", + "

5275 rows × 1 columns

\n", + "" + ], + "text/plain": [ + " Full.story\n", + "0 Sumitomo Rubber Industries has established a n...\n", + "1 Spot rubber closed unchanged on Thursday. RSS ...\n", + "2 Delegate registration for India Rubber Meet 20...\n", + "3 MUMBAI – Futures contracts of rubber on the In...\n", + "4 Tapping has been delayed despite the fact that...\n", + "... ...\n", + "5270 Cogencis, Wednesday, Apr 11 NEW DELHI – India’...\n", + "5271 Cogencis, Tuesday, Apr 10 By Prabhnoor Nanda N...\n", + "5272 OUTLOOK Futures contracts of natural rubber ma...\n", + "5273 Cogencis, Tuesday, Apr 10 By Shikha Singh NEW ...\n", + "5274 Cogencis, Monday, Apr 9 By Prabhnoor Nanda NEW...\n", + "\n", + "[5275 rows x 1 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "data.rename(columns = {'Full.story':'full_story'}, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
full_story
0Sumitomo Rubber Industries has established a n...
1Spot rubber closed unchanged on Thursday. RSS ...
2Delegate registration for India Rubber Meet 20...
3MUMBAI – Futures contracts of rubber on the In...
4Tapping has been delayed despite the fact that...
......
5270Cogencis, Wednesday, Apr 11 NEW DELHI – India’...
5271Cogencis, Tuesday, Apr 10 By Prabhnoor Nanda N...
5272OUTLOOK Futures contracts of natural rubber ma...
5273Cogencis, Tuesday, Apr 10 By Shikha Singh NEW ...
5274Cogencis, Monday, Apr 9 By Prabhnoor Nanda NEW...
\n", + "

5275 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " full_story\n", + "0 Sumitomo Rubber Industries has established a n...\n", + "1 Spot rubber closed unchanged on Thursday. RSS ...\n", + "2 Delegate registration for India Rubber Meet 20...\n", + "3 MUMBAI – Futures contracts of rubber on the In...\n", + "4 Tapping has been delayed despite the fact that...\n", + "... ...\n", + "5270 Cogencis, Wednesday, Apr 11 NEW DELHI – India’...\n", + "5271 Cogencis, Tuesday, Apr 10 By Prabhnoor Nanda N...\n", + "5272 OUTLOOK Futures contracts of natural rubber ma...\n", + "5273 Cogencis, Tuesday, Apr 10 By Shikha Singh NEW ...\n", + "5274 Cogencis, Monday, Apr 9 By Prabhnoor Nanda NEW...\n", + "\n", + "[5275 rows x 1 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(5275, 1)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents = data[['full_story']]\n", + "documents.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
full_story
0Sumitomo Rubber Industries has established a n...
1Spot rubber closed unchanged on Thursday. RSS ...
2Delegate registration for India Rubber Meet 20...
3MUMBAI – Futures contracts of rubber on the In...
4Tapping has been delayed despite the fact that...
......
5270Cogencis, Wednesday, Apr 11 NEW DELHI – India’...
5271Cogencis, Tuesday, Apr 10 By Prabhnoor Nanda N...
5272OUTLOOK Futures contracts of natural rubber ma...
5273Cogencis, Tuesday, Apr 10 By Shikha Singh NEW ...
5274Cogencis, Monday, Apr 9 By Prabhnoor Nanda NEW...
\n", + "

5275 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " full_story\n", + "0 Sumitomo Rubber Industries has established a n...\n", + "1 Spot rubber closed unchanged on Thursday. RSS ...\n", + "2 Delegate registration for India Rubber Meet 20...\n", + "3 MUMBAI – Futures contracts of rubber on the In...\n", + "4 Tapping has been delayed despite the fact that...\n", + "... ...\n", + "5270 Cogencis, Wednesday, Apr 11 NEW DELHI – India’...\n", + "5271 Cogencis, Tuesday, Apr 10 By Prabhnoor Nanda N...\n", + "5272 OUTLOOK Futures contracts of natural rubber ma...\n", + "5273 Cogencis, Tuesday, Apr 10 By Shikha Singh NEW ...\n", + "5274 Cogencis, Monday, Apr 9 By Prabhnoor Nanda NEW...\n", + "\n", + "[5275 rows x 1 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
full_story
4310Rubber futures on TOCOM were trading higher du...
\n", + "
" + ], + "text/plain": [ + " full_story\n", + "4310 Rubber futures on TOCOM were trading higher du..." + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents[documents.index == 4310]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## EDA" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install wordcloud" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "from wordcloud import WordCloud" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# Join the different processed titles together.\n", + "long_string = ','.join(list(documents['full_story'].values))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wordcloud = WordCloud(background_color=\"white\", max_words=5000, contour_width=8, contour_color='steelblue')\n", + "wordcloud.generate(long_string)\n", + "wordcloud.to_image()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "def freq_words(x, terms = 30):\n", + " all_words = ' '.join([text for text in x])\n", + " all_words = all_words.split()\n", + "\n", + " fdist = FreqDist(all_words)\n", + " words_df = pd.DataFrame({'word':list(fdist.keys()), 'count':list(fdist.values())})\n", + "\n", + " d = words_df.nlargest(columns=\"count\", n = terms) \n", + " plt.figure(figsize=(20,5))\n", + " ax = sns.barplot(data=d, x= \"word\", y = \"count\")\n", + " ax.set(ylabel = 'Count')\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + 
"image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "freq_words(documents['full_story'])" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "documents['full_story'] = documents['full_story'].str.replace(\"[^a-zA-Z#@]\", \" \")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data processing" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "# import nltk\n", + "# nltk.download('wordnet')" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# stop_words.extend(['from', 'subject', 're', 'edu', 'use'])" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [], + "source": [ + "my_stop_words = STOPWORDS.union(set(['send', 'comment', 'feedback', 'today', 'come', 'harsh', 'akshit', 'nishant', 'chakraborty', 'augustine',\n", + " 'cogencis', 'com', 'rahul', 'dhuri', 'end-users','rupeesedited', 'mugunthan', 'kesavan', 'says', 'said', 'say',\n", + " 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday', \n", + " 'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', \n", + " 'november', 'december','kerela','delhi','kottayam','india','china','japanese','thailand','malaysia','tokyo'\n", + " , \"come\",\"chinese\",\"prabhnoor\",\"shikha\",\"singh\",\"mumbai\",\"rubber\",\"kerala\",\"kochi\",\"make\",\"board\",\"data\",\"grade\"\n", + " ,\"cents\",\"show\",\"table\",\"detail\",\"change\",\"give\",\"nanda\",\"geojit\",\"iran\"])) " + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [], + "source": [ + "def lemmatize_stemming(text):\n", + "# ps=SnowballStemmer(language='english')\n", + " return WordNetLemmatizer().lemmatize(text, pos='v')\n", + "\n", + "def 
preprocess(text):\n", + " result = []\n", + " for token in gensim.utils.simple_preprocess(text):\n", + " if token not in gensim.parsing.preprocessing.STOPWORDS and token not in my_stop_words and len(token) > 3:\n", + " result.append(lemmatize_stemming(token))\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [], + "source": [ + "doc_sample = documents[documents.index == 4310].values[0][0]" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Rubber futures on TOCOM were trading higher due to signs of a recovery in the Chinese economy after the release of stronger than expected data on Friday analysts said China is the largest consumer of natural rubber '" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "doc_sample" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "original document: \n", + "['Rubber', 'futures', 'on', 'TOCOM', 'were', 'trading', 'higher', 'due', 'to', 'signs', 'of', 'a', 'recovery', 'in', 'the', 'Chinese', 'economy', '', 'after', 'the', 'release', 'of', 'stronger', 'than', 'expected', 'data', 'on', 'Friday', '', 'analysts', 'said', '', 'China', 'is', 'the', 'largest', 'consumer', 'of', 'natural', 'rubber', '']\n", + "\n", + "Tokenized and Lemmatized document: \n", + "['futures', 'tocom', 'trade', 'higher', 'sign', 'recovery', 'economy', 'release', 'stronger', 'expect', 'analysts', 'largest', 'consumer', 'natural']\n" + ] + } + ], + "source": [ + "print('original document: ')\n", + "words = []\n", + "for word in doc_sample.split(' '):\n", + " words.append(word)\n", + "print(words)\n", + "\n", + "print('\\nTokenized and Lemmatized document: ')\n", + "print(preprocess(doc_sample))" + ] + }, + { + "cell_type": "code", + 
"execution_count": 110, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 [sumitomo, industries, establish, natural, pro...\n", + "1 [spot, close, unchanged, quote, steady, trader...\n", + "2 [delegate, registration, meet, begin, meet, ho...\n", + "3 [futures, contract, indian, commodity, exchang...\n", + "4 [tap, delay, despite, fact, peak, season, decl...\n", + " ... \n", + "5270 [natural, production, fell, year, accord, prov...\n", + "5271 [futures, contract, national, multi, commodity...\n", + "5272 [outlook, futures, contract, natural, trade, l...\n", + "5273 [price, natural, fell, spot, market, poor, dem...\n", + "5274 [monday, futures, contract, national, multi, c...\n", + "Name: full_story, Length: 5275, dtype: object" + ] + }, + "execution_count": 110, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "processed_docs = documents['full_story'].map(preprocess)\n", + "processed_docs" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [], + "source": [ + "def merging_preprocess(row):\n", + " values = ' '.join(str(row[v]) for v in range(len(row)))\n", + " return values" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": {}, + "outputs": [], + "source": [ + "documents['preprocess_join']=processed_docs.apply(merging_preprocess)" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 sumitomo industries establish natural procurem...\n", + "1 spot close unchanged quote steady traders fini...\n", + "2 delegate registration meet begin meet hold rad...\n", + "3 futures contract indian commodity exchange ris...\n", + "4 tap delay despite fact peak season decline imp...\n", + " ... 
\n", + "5270 natural production fell year accord provisiona...\n", + "5271 futures contract national multi commodity exch...\n", + "5272 outlook futures contract natural trade lower s...\n", + "5273 price natural fell spot market poor demand tra...\n", + "5274 monday futures contract national multi commodi...\n", + "Name: preprocess_join, Length: 5275, dtype: object" + ] + }, + "execution_count": 113, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents['preprocess_join']" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABJgAAAE9CAYAAABHvdhKAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO3deZwtV1kv/N+ThEmDTAmIgJ4oUUBUlICIKGG4jGJAwvSihAg3L1euwetFBceAqKC+wgURZEyACIQwDzIICZNMIRMEEpMLiUSQBAgoMgbW+8danbPT2bt796nu0+ck3+/n05+uXbt2raeqVq1a+6lhV2stAAAAALCr9tnuAAAAAADYu0kwAQAAADCJBBMAAAAAk0gwAQAAADCJBBMAAAAAk0gwAQAAADDJftsdwFY44IAD2o4dO7Y7DAAAAIArjY9+9KNfaK0dOO+9K2WCaceOHTnllFO2OwwAAACAK42qumDRe26RAwAAAGASCSYAAAAAJpFgAgAAAGASCSYAAAAAJpFgAgAAAGASCSYAAAAAJpFgAgAAAGASCSYAAAAAJpFgAgAAAGASCSYAAAAAJpFgAgAAAGCS/bY7gN3h4ue8bFvKPfB//Oq2lAsAAACwO7mCCQAAAIBJJJgAAAAAmESCCQAAAIBJJJgAAAAAmESCCQAAAIBJJJgAAAAAmESCCQAAAIBJJJgAAAAAmESCCQAAAIBJJJgAAAAAmESCCQAAAIBJJJgAAAAAmESCCQAAAIBJJJgAAAAAmESCCQAAAIBJJJgAAAAAmESCCQAAAIBJJJgAAAAAmESCCQAAAIBJJJgAAAAAmESCCQAAAIBJJJgAAAAAmESCCQAAAIBJJJgAAAAAmESCCQAAAIBJJJgAAAAAmESCCQAAAIBJJJgAAAAAmESCCQAAAIBJtjzBVFX7VtVpVfWm8fqgqvpQVZ1bVa+sqquP8dcYr88b7++YmccTx/hzquqeWx0zAAAAAMvbHVcwPS7JJ2dePy3J01trBye5JMmjxvhHJbmktXbzJE8f06WqbpXkoUl+PMm9kvxdVe27G+IGAAAAYAlbmmCqqpsmuW+SF4zXleSuSU4ckxyX5P5j+LDxOuP9u43pD0vyitbaN1trn05yXpLbb2XcAAAAACxvq69gekaS303y3fH6Bkm+3Fq7dLy+MMlNxvBNknwmScb7XxnTXzZ+zmcAAAAA2GZblmCqql9KclFr7aOzo+dM2tZ5b63PzJZ3VFWdUlWnXHzxxRuOFwAAAIBds5VXMP18kl+uqvOTvCL91rhnJLluVe03prlpks+O4QuT3CxJxvvXSfKl2fFzPnOZ1trzWmuHtNYO
OfDAAzd/aQAAAACYa8sSTK21J7bWbtpa25H+kO53tdYenuSkJIePyY5I8vox/IbxOuP9d7XW2hj/0PErcwclOTjJh7cqbgAAAAA2Zr/1J9l0v5fkFVX1lCSnJXnhGP/CJC+tqvPSr1x6aJK01s6qqhOSfCLJpUke21r7zu4PGwAAAIB5dkuCqbV2cpKTx/CnMudX4Fpr30jyoAWf/7Mkf7Z1EQIAAACwq7b6V+QAAAAAuJKTYAIAAABgEgkmAAAAACaRYAIAAABgEgkmAAAAACaRYAIAAABgEgkmAAAAACaRYAIAAABgEgkmAAAAACaRYAIAAABgEgkmAAAAACaRYAIAAABgEgkmAAAAACaRYAIAAABgEgkmAAAAACaRYAIAAABgEgkmAAAAACaRYAIAAABgEgkmAAAAACaRYAIAAABgEgkmAAAAACaRYAIAAABgEgkmAAAAACaRYAIAAABgEgkmAAAAACaRYAIAAABgEgkmAAAAACaRYAIAAABgEgkmAAAAACaRYAIAAABgEgkmAAAAACaRYAIAAABgEgkmAAAAACaRYAIAAABgEgkmAAAAACaRYAIAAABgEgkmAAAAACaRYAIAAABgEgkmAAAAACaRYAIAAABgEgkmAAAAACaRYAIAAABgEgkmAAAAACaRYAIAAABgkv22O4Crqoue+8xtKfeGjzl6W8oFAAAArrxcwQQAAADAJBJMAAAAAEyyZQmmqrpmVX24qs6oqrOq6klj/EFV9aGqOreqXllVVx/jrzFenzfe3zEzryeO8edU1T23KmYAAAAANm4rr2D6ZpK7ttZ+Ksltktyrqu6Q5GlJnt5aOzjJJUkeNaZ/VJJLWms3T/L0MV2q6lZJHprkx5PcK8nfVdW+Wxg3AAAAABuwZQmm1n11vLza+GtJ7prkxDH+uCT3H8OHjdcZ79+tqmqMf0Vr7ZuttU8nOS/J7bcqbgAAAAA2ZkufwVRV+1bV6UkuSvKOJP83yZdba5eOSS5McpMxfJMkn0mS8f5XktxgdvyczwAAAACwzbY0wdRa+05r7TZJbpp+1dEt5002/teC9xaNv5yqOqqqTqmqUy6++OJdDRkAAACADdotvyLXWvtykpOT3CHJdatqv/HWTZN8dgxfmORmSTLev06SL82On/OZ2TKe11o7pLV2yIEHHrgViwEAAADAHFv5K3IHVtV1x/C1ktw9ySeTnJTk8DHZEUleP4bfMF5nvP+u1lob4x86fmXuoCQHJ/nwVsUNAAAAwMbst/4ku+zGSY4bv/i2T5ITWmtvqqpPJHlFVT0lyWlJXjimf2GSl1bVeelXLj00SVprZ1XVCUk+keTSJI9trX1nC+MGAAAAYAO2LMHUWjszyU/PGf+pzPkVuNbaN5I8aMG8/izJn212jAAAAABMt1uewQQAAADAlZcEEwAAAACTSDABAAAAMIkEEwAAAACTSDABAAAAMIkEEwAAAACT7LfdAbDn+Oyzf3tbyv2Bx/7NtpQLAAAAbA5XMAEAAAAwiQQTAAAAAJNIMAEAAAAwiQQTAAAAAJNIMAEAAAAwiQQTAAAAAJNIMAEAAAAwiQQTAAAAAJMslWCqqp9fZhwAAAAAVz3LXsH0rCXHAQAAAHAVs99ab1bVzyW5Y5IDq+q3Z976viT7bmVgAAAAAOwd1kwwJbl6kv3HdNeeGf8fSQ7fqqAAAAAA2HusmWBqrb07ybur6tjW2gW7KSYAAAAA9iLrXcG04hpV9bwkO2Y/01q761YEBQAAAMDeY9kE06uSPDfJC5J8Z+vCAQAAAGBvs2yC6dLW2nO2NBIAAAAA9kr7LDndG6vqN6rqxlV1/ZW/LY0MAAAAgL3CslcwHTH+/87MuJbkhzc3HAAAAAD2NkslmFprB211IAAAAADsnZZKMFXVI+aNb629ZHPDAQAAAGBvs+wtcrebGb5mkrslOTWJBBMAAADAVdyyt8j95uzrqrpOkpduSUSwytnPPmy3l3mLx75+t5cJAAAA
e6tlf0Vuta8lOXgzAwEAAABg77TsM5jemP6rcUmyb5JbJjlhq4ICAAAAYO+x7DOY/npm+NIkF7TWLtyCeAAAAADYyyx1i1xr7d1Jzk5y7STXS/KtrQwKAAAAgL3HUgmmqnpwkg8neVCSByf5UFUdvpWBAQAAALB3WPYWuT9IcrvW2kVJUlUHJvmnJCduVWAAAAAA7B2W/RW5fVaSS8MXN/BZAAAAAK7Elr2C6a1V9bYkLx+vH5LkLVsTEgAAAAB7kzUTTFV18yQ3aq39TlX9SpI7JakkH0hy/G6IDwAAAIA93Hq3uT0jyX8mSWvtNa21326t/a/0q5eesdXBAQAAALDnWy/BtKO1dubqka21U5Ls2JKIAAAAANirrJdguuYa711rMwMBAAAAYO+0XoLpI1X131ePrKpHJfno1oQEAAAAwN5kvV+R+60kr62qh2dnQumQJFdP8oCtDAwAAACAvcOaCabW2ueT3LGq7pLk1mP0m1tr79ryyAAAAADYK6x3BVOSpLV2UpKTtjgWAAAAAPZC6z2DCQAAAADWJMEEAAAAwCRblmCqqptV1UlV9cmqOquqHjfGX7+q3lFV547/1xvjq6qeWVXnVdWZVfUzM/M6Ykx/blUdsVUxAwAAALBxW3kF06VJ/ndr7ZZJ7pDksVV1qyRPSPLO1trBSd45XifJvZMcPP6OSvKcpCekkvxJkp9Ncvskf7KSlAIAAABg+21Zgqm19rnW2qlj+D+TfDLJTZIcluS4MdlxSe4/hg9L8pLWfTDJdavqxknumeQdrbUvtdYuSfKOJPfaqrgBAAAA2Jjd8gymqtqR5KeTfCjJjVprn0t6EirJDcdkN0nymZmPXTjGLRoPAAAAwB5gyxNMVbV/klcn+a3W2n+sNemccW2N8avLOaqqTqmqUy6++OJdCxYAAACADdvSBFNVXS09uXR8a+01Y/Tnx61vGf8vGuMvTHKzmY/fNMln1xh/Oa2157XWDmmtHXLggQdu7oIAAAAAsNB+WzXjqqokL0zyydba38y89YYkRyR56vj/+pnx/7OqXpH+QO+vtNY+V1VvS/LnMw/2vkeSJ25V3LCMDzzvl7al3J876k3bUi4AAACsZcsSTEl+PsmvJflYVZ0+xv1+emLphKp6VJJ/TfKg8d5bktwnyXlJvpbkyCRprX2pqv40yUfGdE9urX1pC+MGAAAAYAO2LMHUWntf5j8/KUnuNmf6luSxC+b1oiQv2rzo4MrnbS+8z7aUe89HvWVbygUAAGDPsVt+RQ4AAACAKy8JJgAAAAAmkWACAAAAYBIJJgAAAAAm2cpfkQOu4k588b22pdzDj3zrtpQLAABwVeUKJgAAAAAmkWACAAAAYBIJJgAAAAAmkWACAAAAYBIJJgAAAAAmkWACAAAAYBIJJgAAAAAmkWACAAAAYBIJJgAAAAAmkWACAAAAYBIJJgAAAAAmkWACAAAAYBIJJgAAAAAmkWACAAAAYBIJJgAAAAAmkWACAAAAYBIJJgAAAAAmkWACAAAAYBIJJgAAAAAmkWACAAAAYBIJJgAAAAAmkWACAAAAYJL9tjsAgN3pxcfdY1vKPfKIt6/5/jOPv+duimSnox/+tt1eJgAAcOUkwQTAXMecsPuTXklyzIMlvgAAYG/jFjkAAAAAJpFgAgAAAGASCSYAAAAAJpFgAgAAAGASCSYAAAAAJpFgAgAAAGASCSYAAAAAJpFgAgAAAGASCSYAAAAAJtlvuwMAgGUd+dp7bUu5L37AW7elXAAA2Fu4ggkAAACASVzBBAAT3Pv1j92Wcv/xsGcvfO8+r33Kboxkp7c84A+3pVwAALafK5gAAAAAmESCCQAAAIBJJJgAAAAAmMQzmACA3eK+r3nmbi/zzb9y9G4vEwDgqsgVTAAAAABMIsEEAAAAwCQSTAAAAABMsmXPYKqqFyX5pSQXtdZuPcZdP8krk+xIcn6SB7fWLqmqSvJ/ktwnydeSPLK1dur4zBFJ/nDM9imt
teO2KmYA4Krlvq9+wbaU++YHPnpbygUA2Cpb+ZDvY5P8bZKXzIx7QpJ3ttaeWlVPGK9/L8m9kxw8/n42yXOS/OxISP1JkkOStCQfrao3tNYu2cK4AQC2zS+dePy2lPumwx++LeUCAFcOW5Zgaq29p6p2rBp9WJJDx/BxSU5OTzAdluQlrbWW5INVdd2quvGY9h2ttS8lSVW9I8m9krx8q+IGAODy7nfia7al3Dce/ivbUi4AsHG7+xlMN2qtfS5Jxv8bjvE3SfKZmekuHOMWjQcAAABgD7GVt8htRM0Z19YYf8UZVB2V5Kgk+cEf/MHNiwwAgD3OYSe+dVvKff3h99qWcgFgT7e7r2D6/Lj1LeP/RWP8hUluNjPdTZN8do3xV9Bae15r7ZDW2iEHHnjgpgcOAAAAwHy7O8H0hiRHjOEjkrx+ZvwjqrtDkq+MW+jeluQeVXW9qrpeknuMcQAAAADsIbbsFrmqenn6Q7oPqKoL038N7qlJTqiqRyX51yQPGpO/Jcl9kpyX5GtJjkyS1tqXqupPk3xkTPfklQd+AwAAALBn2MpfkXvYgrfuNmfaluSxC+bzoiQv2sTQAAAAANhEe8pDvgEAYK/3gFe/b7eX+doH3mm3lwkAq0kwAQDAldiDXn3mtpT7qgf+5LaUC8D2kGACAAB2q6Nf+5ltKfeZD7jZ+hMBsEt296/IAQAAAHAlI8EEAAAAwCQSTAAAAABM4hlMAADAVd6zX/v5bSn3sQ+40baUC7DZJJgAAAD2QK858QvbUu6vHH7AtpQL7N0kmAAAAFjaScdfvNvLvMvDD9ztZQIbI8EEAADAXu20F1y0LeX+9KNvuPC985/x77sxkp12/Nb3b0u5IMEEAAAAVwH//tfnbUu53//4m29LuexefkUOAAAAgElcwQQAAABsi88/46PbUu6Nfuu221LulZkEEwAAAMCMzz/z5N1e5o2OPnTN9y969ht3TyCr3PCx91tqOrfIAQAAADCJBBMAAAAAk0gwAQAAADCJBBMAAAAAk0gwAQAAADCJBBMAAAAAk0gwAQAAADCJBBMAAAAAk0gwAQAAADCJBBMAAAAAk0gwAQAAADCJBBMAAAAAk0gwAQAAADCJBBMAAAAAk0gwAQAAADCJBBMAAAAAk0gwAQAAADCJBBMAAAAAk0gwAQAAADCJBBMAAAAAk0gwAQAAADCJBBMAAAAAk0gwAQAAADCJBBMAAAAAk0gwAQAAADCJBBMAAAAAk0gwAQAAADCJBBMAAAAAk0gwAQAAADCJBBMAAAAAk0gwAQAAADCJBBMAAAAAk0gwAQAAADDJXpNgqqp7VdU5VXVeVT1hu+MBAAAAoNsrEkxVtW+SZye5d5JbJXlYVd1qe6MCAAAAINlLEkxJbp/kvNbap1pr30ryiiSHbXNMAAAAAGTvSTDdJMlnZl5fOMYBAAAAsM2qtbbdMayrqh6U5J6ttUeP17+W5Pattd+cmeaoJEeNlz+W5JxNKv6AJF/YpHltFjEtb0+MS0zLEdPy9sS4xLQcMS1vT4xLTMsR0/L2xLjEtBwxLW9PjEtMyxHT8vbEuDYrph9qrR047439NmHmu8OFSW428/qmST47O0Fr7XlJnrfZBVfVKa21QzZ7vlOIaXl7YlxiWo6YlrcnxiWm5YhpeXtiXGJajpiWtyfGJabliGl5e2JcYlqOmJa3J8a1O2LaW26R+0iSg6vqoKq6epKHJnnDNscEAAAAQPaSK5haa5dW1f9M8rYk+yZ5UWvtrG0OCwAAAIDsJQmmJGmtvSXJW7ah6E2/7W4TiGl5e2JcYlqOmJa3J8YlpuWIaXl7YlxiWo6YlrcnxiWm5YhpeXtiXGJajpiWtyfGteUx7RUP+QYAAABgz7W3PIMJAAAAgD2UBNMCVfXkqrr7dsexoqp2VNX/s4nzu39V3Wqz5renq6pDq+pNG5j+NlV1n00od9PXc1V9dRc/d2xVHb6ZsWymqnpLVV13u+PYbFV1clUd
MobfUlXXHX+/sfJ/4vwfWVV/uznRLixj2+OsqqOr6pNVdfyC9yfHuItxHVNVj9/d5c6J4/yqOmC745hiT1mXqy1R9y47vkyp59ux/CPeH5h5/YItOGat27ZX1e9vZplT7al9pF09/m9S2XtUv3g9e1KbuPr4tNE+6S6Ud2hV3XHm9WOq6hGbXMa2HHM3YmV/qaofqKoTx/CG2ujd1MfaUVUf38XPbvm2npn3Lq2L1TFupa1c/t1lavuwJ7R9EkxzVNW+rbU/bq3903bHMmNHkrkJpqralWdp3T/J3M5TdVeaurGL6+c2SSYnmLLGel5kF+Pdq1VVJfml1tqXtzuWrdRau89Yxusm+Y2Z/5dTVftuVQy7WL92e5xz/EaS+7TWHr7g/bkxrmc3L8OVylVo3a1X9/Zmj0xyWYKptfbo1tonNmPGK32JmXZvLXtUgim7cOzem+zKvrsH9ov3Jrt0fJrg0CSXfaFvrT23tfaSTS5jdy/TLmutfba1tttPsO6mY+Sh2fptPdWhmYlxK+2hy7+mK2VfqrV2lfpLT9ScneS4JGcmOTHJ9yQ5P8kfJ3lfkocmOTbJ4eMzt0vyz0nOSPLhJNdO/zW7v0rykTGf/3edch8xpjsjyUuT/FCSd45x70zyg2O6Y5M8c5T3qZkYPpjkK0lOT/K/0juFr0ryxiTvSrL/mM+pST6W5LA1yr5jki8l+fSY34+M9fLJJH+X5LReNS77/OFJjp2J77lJ3pvkX9KTAllrfST5nZnxTxrjvjfJm0dMH0/ykDW21QvGNMcnuXuS9yc5N8ntx98/j5j/OcmPjc+uXj+HJnnTzPY8LckPjzheNOI7LclhSa6e5F+TXDzWz0NWxfTJJM9PclaStye5VpL/PuZxRpJXp9epeev55CSHjHkdkOT8BfFeYXsm+dX0+vedJH+fXofOHfPZZ2yTe8zb5uvUrbl1Z9GyzqzDM5N8IH27f3y9erBg+16uzo1lmVs3ktw2ybuTfDT9FyVvvIv713OSnDTWwZ3H9v9kRh0f0301ydNGWf+UXs9OHp/55THNNZO8eKyz05LcZYy/VpJXjLJfmeRDM9v8/LGMr0jy9SSXJPl2ej05d8T1D+n7+i8ned2I4awkR83Ed2T6/vfusX3+dow/ML3+fWT8/fwYf0z6Q/3ePub/4+l16fQR58HrtF8r8Z4+5rsS5yfG+5sS5xrlPzfJt8a6/kqSx8+89/H0ujQb419lZp8f0/1tkkfObIfZ9v5Hkrx1LMN7k9xiTPegMf8zkrxnZl5/kOScUTdenuTxa8zj2CxX556T5JSxDp80M/78JE/Kzv1zZb43GNvztPT24IIkB6yxDv80yeNmXv9ZkqMzp31eZ5t+NcmT0+v1nSYej1fvt8esbNv0JP8Hx/uvTXK9Mf7oJJ8Y418xczy5XDs+Ja416t7vZf7x5rK6lt6W/+0G5j+vLi1a9pOTPD3Je0b9uV2S16S3HU+ZmefKseL0UTf2HX/Hptfnj6X3Iw4f2/OcMe21cvlj1L1Gvbsoydkz8z8myZ9k7ePGSrv+Qxnt3hqxPTX9uHZ6+nF+bl3dwDq9wjFkxPC0UfaHk9x8THuFY0XmHLs3qz6tsQ6+OpbzjLHtbzSmPSj9OPuRsV6+usT8d2S5fu4V2qwk1xnT7TPm9T1JPpPkarl8v/huY/t+LH3fu8ZMe7WyrQ9JcvIYvvNY3tPH5669xnaa9/lj0tuId6XX9/8+s++9J30/+UT6/rrPbCwbrU9Zru95/fQ28syxvX5yJs4XZWd/4egxft7x6eSxbc4eZdQS2/YK7XJ27qdnpNfhHUn+Pcm/jfJ+YcT1+CS3TPLhVct65hhe2MdaEMvqZfqr7GxfZvvNvzvGnZHkqeu07ydniTZuA/vaV2eWc6Wf+sjs7IvcN33/OiCL+0+PTO8/XDu9TbjaGH/r9P7bS7L2frZoWW871snqfvRl8Y3Xb0py6Ea39Zz1/NkxzVnpbf7T0tvdLyY5
L5dvd+6Xfow/Lf3YdKN11sX3jeW+WlYdoxfEOLdvtcZ2/KP0/eQd2XmcvML3rpl9cGX5T87Odv9fkvzCLtShdfe5MW7/7PxOcGaSB47xDxvjPp7kabN1MzN9qTHPs9PrzTMz039dJ75F7ei8fuOidutj6QnjGvXhEWP8S5PcfaPrrLV2lU0wtexsOF40Kur5SX53Zrpj0ztfV08/SNxuZifaL8lRSf5wjLtG+heDgxaU+ePpO/PKQfP66YmEI8brX0/yuplyX5WeMLhVkvPG+ENz+S9Lj0xyYZLrj9f7Jfm+MXxAemNR88qeXb5V6+W7Se6wUvFn3ludYHrriO/gEcM1F62PJPdI/2Jb4zNvSvKLSR6Y5PkzZVxnwba6NMlPjM9+dGyvSk+4vG5le4zp757k1QvWz6Gj7DuO+awkHP48ya+O4eumN0DfmwVfEmZius14fUJ6Z/EGM9M8JclvLljPJ2dxgmmt7XlBep25Wnqj9HfpX84enX5Q+50kf7+ovq1TtxbVnbnLOoY/nuSOY/ip2Xlg3Mh+sSOXr3Pnj/KvUDfGcv9zkgPHuIckedEu7l+vyM469B+5fP1aWdaW5N5j+LXpX+SvluSnkpw+xv/vJC8ew7dIT0peM8lvz8T2k2Mdrk4w7cjOxMjH0zvf70vyX+Mznx7bZWXbXWtMd4MkNx5lHZjePr0/OztL/5DxpT/9i9Inx/AxY/lWEoTPSvLwMXz1lfHrtJsr2/jQEedBM+9vSpzrxLCy7o7J/ATTZTEuaDNXJ5hm2/t3ZiTZkvxskneN4Y8luclK+zD+33aM/5709ue89GPIonkcm+Xq3Mo63De9nfjJmVhX2pPfSPKCMfzMJH88hu+bkaBdZxueOob3SfJ/0/ejK7TPi7bpzL7x4PW21xLbc95+e9m2Te8A3XkMPznJM8bwZ7Pzy+zKNpnbjk+NcU7dW3S8uayuZQMJpjXq0qJlPzmjg5rkcWNd3Di9rb0wfb+7ZcaxYky3cqy4bZJ3zJR93Zl5HjIz/uT0L/YHpicWDkry00nePzPNJ9L320XHjcva9VXrb25sY3i2z7EjV6yrN1hmnY7PzDuGnJ/kD8brR8xsr7WOFYcvW+YGYlu0fVqS+41xf5mdx9E3zKyjx2b5BFPL+v3cRW3W67PzhMlDsrPNOTa9P3jNUTd+dIx/SZLfmt3WY3g2QfTGmXj2Tz++LdpOixJMZ6S3RweM8n8gfd/7RvoJw33Tv4AePhvLRutTlut7PivJn4zp75qd/YJj0vsq1xhlfzG977AjVzw+fSXJTUcZH8gSCftcsV2+0VgXB616/5hc/jh52ev0L/k/PIZ/L8kfZo0+1jrraaVf8MCx7vcdMf1rett07zHf71kV3y63cRvc3xYmmJI8ID2xupLwWdR/emR29l1enOT+Y/iJWW4/W7Sss+PXTTBlpk3ewLaeLeMvkzwjve78V5Jnj/iPSU8izbY718vOHwJ7dJL/b511cdTMNPOO0atjvELfao1teEh2ngC5dnqy8fFZ/L1rdvlPnonrPkn+aSP1Z4P73NNWtu3MOvyB7OwH75eeIF9ZZ5f1pbKzTT04vZ05IcsnmBa1o/P6jYvareem9yNvnZ60e/4Yf26S/Te6zlprucrdijN8prX2/jH8svRsa9KvNljtx5J8rrX2kSRprf1HklTVPZL85Mwzba6TXjE+PWced01yYmvtC2MeX6qqn0vyK+P9l6bv2Cte11r7bpJPVNWN1liOd7TWvu8HefUAABGsSURBVDSGK8mfV9UvpnfubpK+E1yh7DXmd0Fr7YNrvL/ihBHfuVX1qfQv14vWxz3G32lj/P5j/HuT/HVVPS19J3rvgrI+3Vr7WJJU1VnpmeJWVR9LP2BcJ8lxVXVw+s56tZnPvmPV8t4y/cvUPVprnx3j7pHkl2eee3HN9APLWj7dWjt9DH90xHHrqnpK+peb/dPP/mzUWttz5QD7kfRG7m5JLmqtHVNVD0rymPQz
Fcna23xe3VpUd+Yu63iWxrVba/88xv9Dkl8awxvZL5L5de5jWVU3qurW6Q3fO/rddNk3yecWLe86+9cbZ+rQ51fVrx3pB7JvpSdSV+L5Zmvt2zP1LulnHJ41yjy7qi5I8qPpCdRnjvFnVtWZC5b9Mq21d1fVTdL3kzumf3G9dDz75QFjspulr8vvT+90XzzifuUoN+lfem811lGSfF9VXXsMv6G19vUx/IEkf1BVN03ymtbauevFuMqHW2uz23RT4myt/ecG45jilSOu/dPX+atm4rnG+P/+JMdW1QnpZ1GTfvbtta21r43PvyG93Vg0j2S5OvfgqjoqvRNy4/Qk8ErdWSn7o9lZr39xZbi19uaqumSthW2tnV9VX6yqn07fv09LPzs8r31+T+Zv0y+mn/F89VplLWnefpskqarrpHc63z2mPS49OZ70dXJ8Vb0u/Ytesrgd/+QmxDlrrePNrphXl743i5c96QmHpLdLZ7XWPjc++6n07XSn9GTSR8b6vFb6FUhvTPLDVfWs9LOdb18ntjukn1n+9Jj/9as/q+nA9CsvP5fk6QuOG4v6EndbENvlzKurrbUvrhPvrHnHkKSf+V75//QxvNaxYissWgffSv8imfT9/L+N4Z9P/wKxEt/TlixnzX7uOu3eK9MTDCelX4Hxd6vm/WPpfYN/Ga+PS09+PWONeN6f5G+qP8fsNa21C0d7OG87LfL6cQz7elWdlH4l0ZfTj0efGsv18vR94MSVD+1ifVqv7/lDGdultfauqrrBaLeS5M2ttW8m+WZVXZSd+8VqH26tXTjKOH3M933rxLW6XT4qM/vpOn38FSckeXD6ycGHjL8fy+I+1jLulOTlrbXvJPl8Vb07/fhy5/QTcV9biW+d9j1Zv43bSFuwyF3SExf3WPlel7X7TytekH5F1uvSr8L593X2s7nLOmf8S9OTcWu5XJu83raeU8Y10q/6ucsYPiO93XlO+ony52dnu3PTJK+sqhunnyCc14efXRdHjnkn84/Rq83rWy1yp+zc91NVbxzjl/3eNdt/2rFOWfMsu8/dPb29zBh/yTg+zvaDj0/vu70ul+9L3SK9zTl3TPeyUc4yFrWj8/qNd8r8duu9I64L0uvDUeM7yZdaa7v03L+raoKpLXj9X3OmrTnTr4z/zdbaMomERfNYFNM3V312kdl4H57e8bvt+CJ8fnone5my581v9jPXXCPWlddz10dV3TPJX7TW/n51YVV12/SM8l9U1dtba0+eE9PsuvjuzOvvptffP01yUmvtAVW1Iz1bPW95kn6wvGb62diVBFOlX8Z4zqrYfnZOLPNi+k56B/HY9Kz0GVX1yPSzDfNcmp3PPlu9Xtfanl9M75T9RlV9tbX2YyPO70k/ECS9gf3PrL3N59WtRXVn0bKuVSc3sl8kc/a51tq/rK4b6VcRndVa+7kFZe7K/jVbn1Zer7SJ326ttdXTtda+WzufYbTWelh2n5v19vTG/8gkv15Vh6YfsH6utfa1qjo5O7fLovnvM6b/+uzIcbC5bF231v6hqj6UfsbibVX16NbauzYQ62Xz2sw4lzS7DyVX3I+WnW5lGfZJ8uXW2m1WvZ/W2mNGW3DfJKdX1co0q5dr4TyGNetcVR2UfkbudqNTcuyqeFc+851c/ri90Xr2gvQzkN+ffrb1bpnTPq+zTb8xvkRMtZFj06z7pneEfjnJH1XVj2dBO74F1jre7KqNroP12q9Kclxr7YmrP1hVP5XknunJgAenX7GzyOrtc2L61Svfn35F3lrHjXl9qZV5zo1tjtV1dWkLjiHJ5Zdn0XrflTq5EXPXQVU9fuaYM3U/n/eZ1f3ctdqsN6Svt+unJ8NWHxvWOvbN7eO01p5aVW9O3yYfrKq7jxMzq7fTWn2kRcu0aPysjdan9fqel875zEq5q/tNi75rLTtdkoXt8hnpyaGNeGV6kuM16Y/DOLeqfiKL+1jLWFQndqWdX6aPNtWn0q96+9H0K+2T
tftPSZLW2vurakdV3XlM/61V813r++TlZpvF62VR32VXj5krdefnk5zTWvvpqvrymN+3s7PuzdbBZyX5m9baG8Znj1k9z1XrYt/W2spDyucdo1d/9gp9qzWSvovq1rFZ7nvXov7Tuja4z83bPmu1lav7Uru0bdc43s1b7nnxtPSTio9NPzH3B+lX9x2ennjaJVeaBzlv0A9Wv8Ih6fdGrnXG4OwkP1BVt0uSqrr2+IL5tiT/o6quNsb/aFV974J5vDP97PQNxrTXT79kdCXT+fB1Ykh64mB1Jn3WddKvaPl2Vd0l/ezKorKXmd/nq+qW1R/2/YBV7z2oqvapqh9Jb6DPyeL18bb0L8v7j/E3qaobVj8T+rXW2suS/HWSn1ln+dda7n8bw49cZ9ovpzdmfz4ajYz4frPGEWSc4UrWXz+rXTvJ58byzz4IdvV8zk/vsCV9511k9fa8fpJ7V9UNR5zXr6ofSj+beXz6/d7PH59dtM2XLeuH1pq4tXZJkv+sqjuMUQ+deXsj+8VcC+rGOUkOXNlvq+pqMwetzdi/dsV7xrxTVT+a3jCfs2r8rdNveVttpV7M1o+3pt+OktbaWenb5ZJxULtF+tmrpN+vfeg483C19LNoK96e5H+uvJhJiFxOVf1wkk+11p6Z/mViXozz4p1ny+Jc4PyM9qKqfiZjnc2J8YL0s5HXGGdo7jZvZuPs5aerXwm48mDinxrDP9Ja+1Br7Y+TfCH97NV7kjygqq5V/ezm/ZJ8bdE8lvR96R3Sr1S/snC9M5nJ5evZvdMvx17Pa9Pv879d+r46t33O4m26mRa2U621ryS5pKp+YYz6tSTvHsejm7XWTko/czp75nJeO77ZNnK8Wca8uvRfmbPsG5jnO5McvvpYUf0XZfZprb06/XkWK8fcRfv2B5LcuXryM0nekt6mHp6ebNrQcWOt2MZ73145bgyr6+rS1uhfPGTm/wfG8KJjxUb7AMtaax3M8/5V8S1rzX7uWu3eOGP94ST/J/2M+OqE8tnpVzPffLyeraPnZ2cfZ+XKq5W29GOttaelf6G/xYLtNPfzw2FVdc3RZhyafkV3kty+qg4a7cNDVi/rsMv1aYHZ9vfQJF+YuRJmns2oT/Pa5WtkZj9dpo/fWvu/6V86/yg779xYq4+1yGwZ70nykKrat6oOTE8wfDj9WP/r1U+Gpqquv6h9X3YlbJIL0q/qeMnMci7bL3lJ+lWQr8r6+9ncZW39Rw++UlV3GuNn9+3zk9xmfM+6WfqVesmqNnm9bb2q7OuMaU4edef7Fizbitlj3RFrTLeyLl48Ylp0jL5cjAv6Vou8L8n9xr6/f/r3uGTx967NtJF9bnX9uV56P/jOVXVA9Qd5Pyzz6/rZSQ6q/r06Y7qlrHG8m2duu9Va+0z6Lb0Ht3416PvST3rucoLpqnoF0yeTHFFVf59+f+FzkvzmvAlba9+qqockeVZVXSv9gXZ3Tz8bsiPJqaNTe3H6r47Mm8dZVfVn6R3k76TfjnB0khdV1e+Mzx65TsxnJrm0qs5Iz9quvh3i+CRvrKpT0m+3OHuNsh+Zfgby+VV1dHqHcXUH4gnpl2t/Jv2e0/1n3jsnfQe5UZLHtNa+UVVz10dr7e1VdcskHxh9/6+mX4p58yR/VVXfTc+g/491ln+Rv0y/ZeG3c8WzbFfQWvt8Vd0vyT9W1a+nn5F+RpIzR9znp9/udVKSJ1S/bPkvWmvzbp+c9UfpDckF6ZcrrjSkq9fzXyc5oap+bZ14523PZ2fnQ8Xfkf6sn9ul3//9nap6YFUd2Vp78YJtvpGy1vOosVz/lX4W/ytj/NL7xRp+IqvqxtgPD0/yzOrJgv3St9tZm7R/7Yq/S/Lc6pfMX5r+fJ9vVtVzkry4+q1xp6d3si6ntfbFqnp/+n70X9V/nvbr6fvHi8dkb03ymDGfc9IfyJfW2ueq6pj0zsbn0h/it/ILFEcnefb4zH7pB5PHzIn9IUl+taq+nf7wxXlX
D14h3pk4Pz/z9lbGOc+rkzxi7JsfSX/ezuoY/7G19jvVL78+M72dP23hHPsB9zlVtfIsilekn6X6q+q3Q1X6F8MzWmut+u1+p6fv7+9dZx7rGmfgTkt/iOSn0r9UrudJSV5eVaem16N/XaKcb1W/teTL40vjovZ57jbdTAv22/NnJjkiff/6nvR1cmR6/XnZaAMqydNba1+uqkXt+Gbb0PFmPa21UxfUpXnLvuw8PzHq4NtHZ//b6Wcmv57eLq2cWFy5gubYUdbX028XW5nPxdVv2XzN+MxF6ce1fxv79oaPG2vEdkH67etnVtWprbWHz6mrG3GFY0h6Uuwa1a/c3Cc7O/CLjhWXO3aPL+WTrbEOFnlckn+oqsdlY7emLtPPXavNemX6F+hD5yzDN6rqyPSrYPZLb4efO95+UpIXVtXvp/eJVvxW9UTkd9Kf4fWPY96rt9O1Fnw+6cfSN6efzPnT1tpnq5/c+UD67V4/kZ0P/F4d85T6NM8x2Xmc/1rW/hJ+hePTWI6NmtcuX5x+K83sfvrf0m+JPbGqDsv87zevTH/uz8pJrYV9rA0s08oPNrT0ZxD9e5K3Vk/UnFJV30pPVP9+JrRxm6W1dk5VPTy9Ht8vy/dLjk9/5s8b0/eh9fazRct6ZHrb87VcPun5/vRb0lYeDH3qiHdem7zetj4ifd/83vRkzz3S73pYKxma9Pr9qqr6t/R6dtCC6VbWxcrtx4uO0atj/F+r+1aLAmmtfaT67eNnpB8rTkn/zrHoe9dm2sg+95T0+vPx9HbuSa2111TVE9O/U1aSt7TWXj9nGb8xtu2bq+oL6QmeWy8Z46Lj3TzHZHG79aHs7KO/N8lfZMLJ+ZUHeF1lVL+s/U2ttWU3HDOq37rxptbaosrLVUBV7T/OcqaqnpD+ayOP2+aw9mqj8/GxJD8zzjzBphqdoVOTPKht/LlbsNtsdl2tfgvfIW088+vK7MrYzx0nK77aWvvrVeMPTX+g75oJZW0fm2Uk4g5LT3BcqfazjVpZF621X9vicvZvrX119JPfk/5rbqduZZlMc1W9ggmY5r4jK79f+tmDR25vOHu3qrp7+nMh/kZyia1QVbdKvyr1tb5gsSdTV9lM6hObpfqPJNw7/Xk3V2m7eV08b+zH10x/hp3k0h7uKncFEwAAAACb66r6kG8AAAAANokEEwAAAACTSDABAAAAMIkEEwDAHq6qDq2qN213HAAAi0gwAQDsYapq3+2OAQBgIySYAAA2UVX9blUdPYafXlXvGsN3q6qXVdXDqupjVfXxqnrazOe+WlVPrqoPJfm5qrpXVZ1dVe9L8ivbszQAAMuRYAIA2FzvSfILY/iQJPtX1dWS3CnJuUmeluSuSW6T5HZVdf8x7fcm+Xhr7WeTnJLk+UnuN+b1/bsvfACAjZNgAgDYXB9NctuqunaSbyb5QHqi6ReSfDnJya21i1trlyY5Pskvjs99J8mrx/Atkny6tXZua60lednuXAAAgI2SYAIA2ESttW8nOT/JkUn+Ocl7k9wlyY8k+dc1PvqN1tp3Zme1VTECAGw2CSYAgM33niSPH//fm+QxSU5P8sEkd66qA8aDvB+W5N1zPn92koOq6kfG64dtfcgAALtOggkAYPO9N8mNk3ygtfb5JN9I8t7W2ueSPDHJSUnOSHJqa+31qz/cWvtGkqOSvHk85PuC3RY5AMAuqH5bPwAAAADsGlcwAQAAADCJBBMAAAAAk0gwAQAAADCJBBMAAAAAk0gwAQAAADCJBBMAAAAAk0gwAQAAADCJBBMAAAAAk/z/pVKdj0cTqZUAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "freq_words(documents['preprocess_join'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Filters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "-- filter tokens less than 15 documents\n", + "-- keep only 100000 words" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 establish\n", + "1 industries\n", + "2 largest\n", + "3 natural\n", + "4 procurement\n", + "5 singapore\n" + ] + } + ], + "source": [ + "dictionary = gensim.corpora.Dictionary(processed_docs)\n", + "\n", + "count = 0\n", + "for k, v in dictionary.iteritems():\n", + " print(k, v)\n", + " count += 1\n", + " if count > 5:\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 116, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dictionary" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": {}, + "outputs": [], + "source": [ + "dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Gensim doc2bow-- Creating dictionary from the data" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Word 1 (\"largest\") appears 1 time.\n", + "Word 2 (\"natural\") appears 1 time.\n", + "Word 4 (\"trade\") appears 1 time.\n", + "Word 27 
(\"futures\") appears 1 time.\n", + "Word 79 (\"analysts\") appears 1 time.\n", + "Word 89 (\"consumer\") appears 1 time.\n", + "Word 130 (\"tocom\") appears 1 time.\n", + "Word 142 (\"expect\") appears 1 time.\n", + "Word 143 (\"higher\") appears 1 time.\n", + "Word 161 (\"economy\") appears 1 time.\n", + "Word 382 (\"release\") appears 1 time.\n", + "Word 572 (\"sign\") appears 1 time.\n", + "Word 621 (\"recovery\") appears 1 time.\n" + ] + } + ], + "source": [ + "bow_doc_4310 = bow_corpus[4310]\n", + "for i in range(len(bow_doc_4310)):\n", + " print(\"Word {} (\\\"{}\\\") appears {} time.\".format(bow_doc_4310[i][0], \n", + " dictionary[bow_doc_4310[i][0]], \n", + " bow_doc_4310[i][1]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Saving pickle files" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "metadata": {}, + "outputs": [], + "source": [ + "import pickle\n", + "pickle.dump(bow_corpus, open('corpus.pkl', 'wb'))\n", + "dictionary.save('dictionary.gensim')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## LDA" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": {}, + "outputs": [], + "source": [ + "lda_model_batch = gensim.models.ldamodel.LdaModel(corpus=bow_corpus,\n", + " id2word=dictionary,\n", + " num_topics=20, \n", + " random_state=100,\n", + " update_every=1,\n", + " chunksize=100,\n", + " passes=10,\n", + " alpha='auto',\n", + " per_word_topics=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": {}, + "outputs": [], + "source": [ + "lda_model_batch.save('model_lda.gensim')" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(0, '0.261*\"state\" + 0.181*\"continue\" + 0.156*\"increase\" + 0.090*\"cheaper\" + 0.083*\"research\" + 0.049*\"recent\" + 0.045*\"worry\" + 0.044*\"flat\" + 
0.027*\"economic\" + 0.027*\"tepid\"'), (1, '0.308*\"produce\" + 0.138*\"face\" + 0.098*\"plantations\" + 0.087*\"group\" + 0.079*\"producers\" + 0.075*\"plan\" + 0.071*\"current\" + 0.048*\"vietnam\" + 0.033*\"latex\" + 0.018*\"expand\"'), (2, '0.201*\"contract\" + 0.155*\"exchange\" + 0.121*\"commodity\" + 0.074*\"rupees\" + 0.060*\"week\" + 0.058*\"gain\" + 0.054*\"benchmark\" + 0.046*\"higher\" + 0.043*\"track\" + 0.033*\"futures\"'), (3, '0.235*\"rupees\" + 0.124*\"widely\" + 0.113*\"rupee\" + 0.088*\"unchanged\" + 0.081*\"sell\" + 0.071*\"variety\" + 0.067*\"show\" + 0.059*\"trade\" + 0.044*\"traders\" + 0.036*\"cap\"'), (4, '0.298*\"analysts\" + 0.111*\"lower\" + 0.084*\"global\" + 0.078*\"largest\" + 0.042*\"investors\" + 0.041*\"world\" + 0.037*\"consumer\" + 0.035*\"firm\" + 0.035*\"fall\" + 0.033*\"profit\"'), (5, '0.190*\"see\" + 0.161*\"base\" + 0.121*\"tap\" + 0.086*\"season\" + 0.061*\"alencherry\" + 0.046*\"owner\" + 0.042*\"grow\" + 0.035*\"areas\" + 0.035*\"gain\" + 0.034*\"peak\"'), (6, '0.254*\"close\" + 0.235*\"rupees\" + 0.119*\"previous\" + 0.100*\"bourse\" + 0.096*\"come\" + 0.076*\"contract\" + 0.057*\"active\" + 0.049*\"end\" + 0.009*\"affect\" + 0.004*\"step\"'), (7, '0.327*\"year\" + 0.190*\"outlook\" + 0.185*\"natural\" + 0.125*\"global\" + 0.084*\"association\" + 0.023*\"president\" + 0.021*\"economy\" + 0.014*\"forecast\" + 0.007*\"line\" + 0.007*\"slow\"'), (8, '0.188*\"trader\" + 0.167*\"decline\" + 0.140*\"weigh\" + 0.120*\"base\" + 0.076*\"largely\" + 0.065*\"financial\" + 0.058*\"buy\" + 0.031*\"tyres\" + 0.030*\"boost\" + 0.026*\"bank\"'), (9, '0.205*\"market\" + 0.106*\"natural\" + 0.102*\"spot\" + 0.097*\"domestic\" + 0.083*\"demand\" + 0.082*\"likely\" + 0.074*\"traders\" + 0.070*\"supply\" + 0.047*\"rise\" + 0.027*\"price\"'), (10, '0.383*\"expect\" + 0.234*\"output\" + 0.102*\"estimate\" + 0.068*\"cover\" + 0.054*\"tree\" + 0.043*\"plant\" + 0.030*\"agriculture\" + 0.029*\"ministry\" + 0.021*\"authority\" + 
0.011*\"million\"'), (11, '0.333*\"accord\" + 0.257*\"tyre\" + 0.091*\"steady\" + 0.050*\"quote\" + 0.050*\"improve\" + 0.043*\"sector\" + 0.034*\"strength\" + 0.032*\"mix\" + 0.031*\"open\" + 0.026*\"respectively\"'), (12, '0.190*\"icex\" + 0.180*\"remain\" + 0.129*\"account\" + 0.083*\"trend\" + 0.082*\"level\" + 0.063*\"capital\" + 0.061*\"short\" + 0.056*\"resistance\" + 0.051*\"term\" + 0.034*\"director\"'), (13, '0.228*\"production\" + 0.182*\"stock\" + 0.117*\"high\" + 0.081*\"earlier\" + 0.055*\"growth\" + 0.046*\"average\" + 0.040*\"deal\" + 0.037*\"industry\" + 0.034*\"compare\" + 0.028*\"industrial\"'), (14, '0.221*\"trade\" + 0.207*\"follow\" + 0.191*\"monday\" + 0.165*\"compare\" + 0.052*\"add\" + 0.038*\"strong\" + 0.029*\"early\" + 0.025*\"product\" + 0.019*\"ahead\" + 0.015*\"slightly\"'), (15, '0.182*\"crude\" + 0.159*\"futures\" + 0.120*\"tocom\" + 0.093*\"cue\" + 0.055*\"dollar\" + 0.053*\"trade\" + 0.051*\"shanghai\" + 0.046*\"weakness\" + 0.044*\"international\" + 0.042*\"weak\"'), (16, '0.249*\"export\" + 0.218*\"report\" + 0.142*\"consumption\" + 0.103*\"government\" + 0.075*\"place\" + 0.054*\"second\" + 0.047*\"share\" + 0.032*\"farmers\" + 0.016*\"uncertainty\" + 0.013*\"force\"'), (17, '0.569*\"price\" + 0.152*\"rise\" + 0.113*\"fall\" + 0.072*\"fell\" + 0.025*\"variety\" + 0.024*\"sharp\" + 0.015*\"show\" + 0.007*\"prevent\" + 0.005*\"source\" + 0.004*\"water\"'), (18, '0.447*\"import\" + 0.383*\"month\" + 0.067*\"general\" + 0.020*\"statistics\" + 0.016*\"directorate\" + 0.016*\"natural\" + 0.015*\"commercial\" + 0.009*\"show\" + 0.005*\"intelligence\" + 0.003*\"fell\"'), (19, '0.289*\"support\" + 0.146*\"take\" + 0.089*\"near\" + 0.062*\"pressure\" + 0.052*\"begin\" + 0.043*\"plantation\" + 0.037*\"hold\" + 0.037*\"position\" + 0.027*\"result\" + 0.027*\"meet\"')]\n" + ] + } + ], + "source": [ + "print(lda_model_batch.print_topics())\n", + "doc_lda = lda_model_batch[bow_corpus]" + ] + }, + { + "cell_type": "code", + "execution_count": 
124, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0, '0.261*\"state\" + 0.181*\"continue\" + 0.156*\"increase\" + 0.090*\"cheaper\" + 0.083*\"research\"')\n", + "(1, '0.308*\"produce\" + 0.138*\"face\" + 0.098*\"plantations\" + 0.087*\"group\" + 0.079*\"producers\"')\n", + "(2, '0.201*\"contract\" + 0.155*\"exchange\" + 0.121*\"commodity\" + 0.074*\"rupees\" + 0.060*\"week\"')\n", + "(3, '0.235*\"rupees\" + 0.124*\"widely\" + 0.113*\"rupee\" + 0.088*\"unchanged\" + 0.081*\"sell\"')\n", + "(4, '0.298*\"analysts\" + 0.111*\"lower\" + 0.084*\"global\" + 0.078*\"largest\" + 0.042*\"investors\"')\n", + "(5, '0.190*\"see\" + 0.161*\"base\" + 0.121*\"tap\" + 0.086*\"season\" + 0.061*\"alencherry\"')\n", + "(6, '0.254*\"close\" + 0.235*\"rupees\" + 0.119*\"previous\" + 0.100*\"bourse\" + 0.096*\"come\"')\n", + "(7, '0.327*\"year\" + 0.190*\"outlook\" + 0.185*\"natural\" + 0.125*\"global\" + 0.084*\"association\"')\n", + "(8, '0.188*\"trader\" + 0.167*\"decline\" + 0.140*\"weigh\" + 0.120*\"base\" + 0.076*\"largely\"')\n", + "(9, '0.205*\"market\" + 0.106*\"natural\" + 0.102*\"spot\" + 0.097*\"domestic\" + 0.083*\"demand\"')\n", + "(10, '0.383*\"expect\" + 0.234*\"output\" + 0.102*\"estimate\" + 0.068*\"cover\" + 0.054*\"tree\"')\n", + "(11, '0.333*\"accord\" + 0.257*\"tyre\" + 0.091*\"steady\" + 0.050*\"quote\" + 0.050*\"improve\"')\n", + "(12, '0.190*\"icex\" + 0.180*\"remain\" + 0.129*\"account\" + 0.083*\"trend\" + 0.082*\"level\"')\n", + "(13, '0.228*\"production\" + 0.182*\"stock\" + 0.117*\"high\" + 0.081*\"earlier\" + 0.055*\"growth\"')\n", + "(14, '0.221*\"trade\" + 0.207*\"follow\" + 0.191*\"monday\" + 0.165*\"compare\" + 0.052*\"add\"')\n", + "(15, '0.182*\"crude\" + 0.159*\"futures\" + 0.120*\"tocom\" + 0.093*\"cue\" + 0.055*\"dollar\"')\n", + "(16, '0.249*\"export\" + 0.218*\"report\" + 0.142*\"consumption\" + 0.103*\"government\" + 0.075*\"place\"')\n", + "(17, '0.569*\"price\" + 0.152*\"rise\" + 
0.113*\"fall\" + 0.072*\"fell\" + 0.025*\"variety\"')\n", + "(18, '0.447*\"import\" + 0.383*\"month\" + 0.067*\"general\" + 0.020*\"statistics\" + 0.016*\"directorate\"')\n", + "(19, '0.289*\"support\" + 0.146*\"take\" + 0.089*\"near\" + 0.062*\"pressure\" + 0.052*\"begin\"')\n" + ] + } + ], + "source": [ + "topics = lda_model_batch.print_topics(num_words=5)\n", + "for topic in topics:\n", + " print(topic)" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0, '0.261*\"state\" + 0.181*\"continue\" + 0.156*\"increase\" + 0.090*\"cheaper\" + 0.083*\"research\" + 0.049*\"recent\" + 0.045*\"worry\" + 0.044*\"flat\"')\n", + "(1, '0.308*\"produce\" + 0.138*\"face\" + 0.098*\"plantations\" + 0.087*\"group\" + 0.079*\"producers\" + 0.075*\"plan\" + 0.071*\"current\" + 0.048*\"vietnam\"')\n", + "(2, '0.201*\"contract\" + 0.155*\"exchange\" + 0.121*\"commodity\" + 0.074*\"rupees\" + 0.060*\"week\" + 0.058*\"gain\" + 0.054*\"benchmark\" + 0.046*\"higher\"')\n", + "(3, '0.235*\"rupees\" + 0.124*\"widely\" + 0.113*\"rupee\" + 0.088*\"unchanged\" + 0.081*\"sell\" + 0.071*\"variety\" + 0.067*\"show\" + 0.059*\"trade\"')\n", + "(4, '0.298*\"analysts\" + 0.111*\"lower\" + 0.084*\"global\" + 0.078*\"largest\" + 0.042*\"investors\" + 0.041*\"world\" + 0.037*\"consumer\" + 0.035*\"firm\"')\n", + "(5, '0.190*\"see\" + 0.161*\"base\" + 0.121*\"tap\" + 0.086*\"season\" + 0.061*\"alencherry\" + 0.046*\"owner\" + 0.042*\"grow\" + 0.035*\"areas\"')\n", + "(6, '0.254*\"close\" + 0.235*\"rupees\" + 0.119*\"previous\" + 0.100*\"bourse\" + 0.096*\"come\" + 0.076*\"contract\" + 0.057*\"active\" + 0.049*\"end\"')\n", + "(7, '0.327*\"year\" + 0.190*\"outlook\" + 0.185*\"natural\" + 0.125*\"global\" + 0.084*\"association\" + 0.023*\"president\" + 0.021*\"economy\" + 0.014*\"forecast\"')\n", + "(8, '0.188*\"trader\" + 0.167*\"decline\" + 0.140*\"weigh\" + 0.120*\"base\" + 0.076*\"largely\" 
+ 0.065*\"financial\" + 0.058*\"buy\" + 0.031*\"tyres\"')\n", + "(9, '0.205*\"market\" + 0.106*\"natural\" + 0.102*\"spot\" + 0.097*\"domestic\" + 0.083*\"demand\" + 0.082*\"likely\" + 0.074*\"traders\" + 0.070*\"supply\"')\n", + "(10, '0.383*\"expect\" + 0.234*\"output\" + 0.102*\"estimate\" + 0.068*\"cover\" + 0.054*\"tree\" + 0.043*\"plant\" + 0.030*\"agriculture\" + 0.029*\"ministry\"')\n", + "(11, '0.333*\"accord\" + 0.257*\"tyre\" + 0.091*\"steady\" + 0.050*\"quote\" + 0.050*\"improve\" + 0.043*\"sector\" + 0.034*\"strength\" + 0.032*\"mix\"')\n", + "(12, '0.190*\"icex\" + 0.180*\"remain\" + 0.129*\"account\" + 0.083*\"trend\" + 0.082*\"level\" + 0.063*\"capital\" + 0.061*\"short\" + 0.056*\"resistance\"')\n", + "(13, '0.228*\"production\" + 0.182*\"stock\" + 0.117*\"high\" + 0.081*\"earlier\" + 0.055*\"growth\" + 0.046*\"average\" + 0.040*\"deal\" + 0.037*\"industry\"')\n", + "(14, '0.221*\"trade\" + 0.207*\"follow\" + 0.191*\"monday\" + 0.165*\"compare\" + 0.052*\"add\" + 0.038*\"strong\" + 0.029*\"early\" + 0.025*\"product\"')\n", + "(15, '0.182*\"crude\" + 0.159*\"futures\" + 0.120*\"tocom\" + 0.093*\"cue\" + 0.055*\"dollar\" + 0.053*\"trade\" + 0.051*\"shanghai\" + 0.046*\"weakness\"')\n", + "(16, '0.249*\"export\" + 0.218*\"report\" + 0.142*\"consumption\" + 0.103*\"government\" + 0.075*\"place\" + 0.054*\"second\" + 0.047*\"share\" + 0.032*\"farmers\"')\n", + "(17, '0.569*\"price\" + 0.152*\"rise\" + 0.113*\"fall\" + 0.072*\"fell\" + 0.025*\"variety\" + 0.024*\"sharp\" + 0.015*\"show\" + 0.007*\"prevent\"')\n", + "(18, '0.447*\"import\" + 0.383*\"month\" + 0.067*\"general\" + 0.020*\"statistics\" + 0.016*\"directorate\" + 0.016*\"natural\" + 0.015*\"commercial\" + 0.009*\"show\"')\n", + "(19, '0.289*\"support\" + 0.146*\"take\" + 0.089*\"near\" + 0.062*\"pressure\" + 0.052*\"begin\" + 0.043*\"plantation\" + 0.037*\"hold\" + 0.037*\"position\"')\n" + ] + } + ], + "source": [ + "topics = lda_model_batch.print_topics(num_words=8)\n", + "for topic in 
topics:\n", + " print(topic)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model Metrics" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Perplexity: -11.818964763758563\n" + ] + } + ], + "source": [ + "print('Perplexity: ', lda_model_batch.log_perplexity(bow_corpus)) " + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Coherence Score for c_v: 0.3965778953756843\n" + ] + } + ], + "source": [ + "from gensim.models import CoherenceModel\n", + "coherence_model_lda_c_v = CoherenceModel(model=lda_model_batch, texts=processed_docs, dictionary=dictionary, coherence='c_v')\n", + "coherence_lda = coherence_model_lda_c_v.get_coherence()\n", + "print('Coherence Score for c_v: ', coherence_lda)" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Coherence Score for u_mass: -4.673057546428844\n" + ] + } + ], + "source": [ + "coherence_model_lda_u_mass = CoherenceModel(model=lda_model_batch, texts=processed_docs, dictionary=dictionary, coherence=\"u_mass\")\n", + "coherence_lda = coherence_model_lda_u_mass.get_coherence()\n", + "print('Coherence Score for u_mass: ', coherence_lda)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Optimal Topics" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=1):\n", + " coherence_values = []\n", + " model_list = []\n", + " for num_topics in range(start, limit, step):\n", + " model=LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)\n", + " model_list.append(model)\n", + " 
coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')\n", + " coherence_values.append(coherencemodel.get_coherence())\n", + "\n", + " return model_list, coherence_values" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mmodel_list\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcoherence_values\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcompute_coherence_values\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdictionary\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdictionary\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcorpus\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mbow_corpus\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtexts\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mprocessed_docs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstart\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlimit\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m20\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstep\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mlimit\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m20\u001b[0m\u001b[1;33m;\u001b[0m \u001b[0mstart\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m;\u001b[0m \u001b[0mstep\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m;\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[0mx\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstart\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mlimit\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstep\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mplt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mplot\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcoherence_values\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mplt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mxlabel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Num Topics\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m\u001b[0m in \u001b[0;36mcompute_coherence_values\u001b[1;34m(dictionary, corpus, texts, limit, start, step)\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[0mmodel_list\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mnum_topics\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstart\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlimit\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstep\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m \u001b[0mmodel\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mLdaModel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mid2word\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdictionary\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnum_topics\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mnum_topics\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 6\u001b[0m 
\u001b[0mmodel_list\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[0mcoherencemodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mCoherenceModel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmodel\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtexts\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mtexts\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdictionary\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdictionary\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcoherence\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'c_v'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\gensim-3.8.1-py3.7-win-amd64.egg\\gensim\\models\\ldamodel.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, corpus, num_topics, id2word, distributed, chunksize, passes, update_every, alpha, eta, decay, offset, eval_every, iterations, gamma_threshold, minimum_probability, random_state, ns_conf, minimum_phi_value, per_word_topics, callbacks, dtype)\u001b[0m\n\u001b[0;32m 517\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mcorpus\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 518\u001b[0m \u001b[0muse_numpy\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdispatcher\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 519\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mchunks_as_numpy\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0muse_numpy\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 520\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 521\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0minit_dir_prior\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mprior\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\gensim-3.8.1-py3.7-win-amd64.egg\\gensim\\models\\ldamodel.py\u001b[0m in \u001b[0;36mupdate\u001b[1;34m(self, corpus, chunksize, decay, offset, passes, update_every, eval_every, iterations, gamma_threshold, chunks_as_numpy)\u001b[0m\n\u001b[0;32m 978\u001b[0m \u001b[0mpass_\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mchunk_no\u001b[0m \u001b[1;33m*\u001b[0m \u001b[0mchunksize\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mchunk\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlencorpus\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 979\u001b[0m )\n\u001b[1;32m--> 980\u001b[1;33m \u001b[0mgammat\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdo_estep\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mchunk\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mother\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 981\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 982\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moptimize_alpha\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\gensim-3.8.1-py3.7-win-amd64.egg\\gensim\\models\\ldamodel.py\u001b[0m in \u001b[0;36mdo_estep\u001b[1;34m(self, 
chunk, state)\u001b[0m\n\u001b[0;32m 740\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mstate\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 741\u001b[0m \u001b[0mstate\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 742\u001b[1;33m \u001b[0mgamma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msstats\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minference\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mchunk\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcollect_sstats\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 743\u001b[0m \u001b[0mstate\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msstats\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[0msstats\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 744\u001b[0m \u001b[0mstate\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnumdocs\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[0mgamma\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;31m# avoids calling len(chunk) on a generator\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\gensim-3.8.1-py3.7-win-amd64.egg\\gensim\\models\\ldamodel.py\u001b[0m in \u001b[0;36minference\u001b[1;34m(self, chunk, collect_sstats)\u001b[0m\n\u001b[0;32m 694\u001b[0m \u001b[0mElogthetad\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdirichlet_expectation\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mgammad\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 695\u001b[0m \u001b[0mexpElogthetad\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexp\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mElogthetad\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 696\u001b[1;33m \u001b[0mphinorm\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdot\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mexpElogthetad\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mexpElogbetad\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0mepsilon\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 697\u001b[0m \u001b[1;31m# If gamma hasn't changed much, we're done.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 698\u001b[0m \u001b[0mmeanchange\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmean_absolute_difference\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mgammad\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlastgamma\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=bow_corpus, texts=processed_docs, start=2, limit=20, step=1)\n", + "limit=20; start=2; step=1;\n", + "x = range(start, limit, step)\n", + "plt.plot(x, coherence_values)\n", + "plt.xlabel(\"Num Topics\")\n", + "plt.ylabel(\"Coherence score\")\n", + "plt.xticks(x)\n", + "plt.legend((\"coherence_values\"), loc='best')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Best model run" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "metadata": {}, + "outputs": [], + "source": [ + "lda_model_best = gensim.models.ldamodel.LdaModel(corpus=bow_corpus,\n", + " id2word=dictionary,\n", + " num_topics=4, \n", + " random_state=100,\n", + " update_every=1,\n", + " chunksize=100,\n", + " passes=10,\n", + " alpha='auto',\n", + " per_word_topics=True)" + 
] + }, + { + "cell_type": "code", + "execution_count": 131, + "metadata": {}, + "outputs": [], + "source": [ + "lda_model_best.save('model_bow_best.gensim')" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(0, '0.057*\"price\" + 0.048*\"market\" + 0.037*\"rise\" + 0.036*\"natural\" + 0.036*\"domestic\" + 0.033*\"demand\" + 0.032*\"likely\" + 0.030*\"supply\" + 0.027*\"fall\" + 0.025*\"spot\"'), (1, '0.021*\"month\" + 0.020*\"year\" + 0.018*\"national\" + 0.017*\"give\" + 0.016*\"multi\" + 0.015*\"company\" + 0.014*\"index\" + 0.013*\"price\" + 0.013*\"heavy\" + 0.012*\"state\"'), (2, '0.129*\"rupees\" + 0.074*\"price\" + 0.073*\"contract\" + 0.064*\"close\" + 0.046*\"commodity\" + 0.043*\"exchange\" + 0.034*\"previous\" + 0.032*\"market\" + 0.027*\"end\" + 0.026*\"active\"'), (3, '0.064*\"price\" + 0.056*\"contract\" + 0.051*\"crude\" + 0.051*\"futures\" + 0.048*\"exchange\" + 0.046*\"analysts\" + 0.038*\"tocom\" + 0.030*\"fell\" + 0.028*\"accord\" + 0.026*\"cue\"')]\n" + ] + } + ], + "source": [ + "print(lda_model_best.print_topics())\n", + "doc_lda_best = lda_model_best[bow_corpus]" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0, '0.057*\"price\" + 0.048*\"market\" + 0.037*\"rise\" + 0.036*\"natural\" + 0.036*\"domestic\"')\n", + "(1, '0.021*\"month\" + 0.020*\"year\" + 0.018*\"national\" + 0.017*\"give\" + 0.016*\"multi\"')\n", + "(2, '0.129*\"rupees\" + 0.074*\"price\" + 0.073*\"contract\" + 0.064*\"close\" + 0.046*\"commodity\"')\n", + "(3, '0.064*\"price\" + 0.056*\"contract\" + 0.051*\"crude\" + 0.051*\"futures\" + 0.048*\"exchange\"')\n" + ] + } + ], + "source": [ + "topics_best = lda_model_best.print_topics(num_words=5)\n", + "for topic in topics_best:\n", + " print(topic)" + ] + }, + { + "cell_type": 
"code", + "execution_count": 134, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0, '0.057*\"price\" + 0.048*\"market\" + 0.037*\"rise\" + 0.036*\"natural\" + 0.036*\"domestic\" + 0.033*\"demand\" + 0.032*\"likely\" + 0.030*\"supply\"')\n", + "(1, '0.021*\"month\" + 0.020*\"year\" + 0.018*\"national\" + 0.017*\"give\" + 0.016*\"multi\" + 0.015*\"company\" + 0.014*\"index\" + 0.013*\"price\"')\n", + "(2, '0.129*\"rupees\" + 0.074*\"price\" + 0.073*\"contract\" + 0.064*\"close\" + 0.046*\"commodity\" + 0.043*\"exchange\" + 0.034*\"previous\" + 0.032*\"market\"')\n", + "(3, '0.064*\"price\" + 0.056*\"contract\" + 0.051*\"crude\" + 0.051*\"futures\" + 0.048*\"exchange\" + 0.046*\"analysts\" + 0.038*\"tocom\" + 0.030*\"fell\"')\n" + ] + } + ], + "source": [ + "topics_best = lda_model_best.print_topics(num_words=8)\n", + "for topic in topics_best:\n", + " print(topic)" + ] + }, + { + "cell_type": "code", + "execution_count": 135, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Perplexity: -5.768329720101713\n" + ] + } + ], + "source": [ + "print('Perplexity: ', lda_model_best.log_perplexity(bow_corpus)) " + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Coherence Score for c_v: 0.7160431196112311\n" + ] + } + ], + "source": [ + "from gensim.models import CoherenceModel\n", + "coherence_model_lda_c_v = CoherenceModel(model=lda_model_best, texts=processed_docs, dictionary=dictionary, coherence='c_v')\n", + "coherence_lda = coherence_model_lda_c_v.get_coherence()\n", + "print('Coherence Score for c_v: ', coherence_lda)" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "metadata": {}, + "outputs": [], + "source": [ + "def get_lda_topics(model, num_topics):\n", + " word_dict = {};\n", + " for i in range(num_topics):\n", + " 
words = model.show_topic(i, topn = 40);\n", + " word_dict['Topic #' + '{:02d}'.format(i)] = [i[0] for i in words];\n", + " return pd.DataFrame(word_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Topic #00Topic #01Topic #02Topic #03
0pricemonthrupeesprice
1marketyearpricecontract
2risenationalcontractcrude
3naturalgiveclosefutures
4domesticmulticommodityexchange
5demandcompanyexchangeanalysts
6likelyindexprevioustocom
7supplypricemarketfell
8fallheavyendaccord
9spotstateactivecue
10tradersexportboursenatural
11comeincreasetradetrade
12basecurrencytradersrise
13tyremakesettlenmce
14dayshighvarietyweek
15raininternationalwidelysynthetic
16commentcomparespotdollar
17importexpectrupeesubstitute
18tradernearmondayfall
19outlookwholesaleweekcommodity
20seemarketfollowshanghai
21outputdenominaterisebenchmark
22gainmanufacturedetailmercantile
23globalreportbenchmarkyork
24productionmonthsshowwarehouse
25endusmajorunchangedtrack
26largestserviceindianmanufacture
27expectationsproductionsellgain
28remaincountryicexglobal
29countriesproductshigherstock
30tapreleasetrackdecline
31expectbuyersfuturescompare
32arrivalsdepartmentgainsupport
33supportperiodhighhigher
34sharpstocklowerweigh
35stockistsearlierweaknessmonth
36sessionsautomonthaccredit
37manufacturersfloodbuylower
38associationquarternaturalweak
39seasontakesteadyconcern
\n", + "
" + ], + "text/plain": [ + " Topic #00 Topic #01 Topic #02 Topic #03\n", + "0 price month rupees price\n", + "1 market year price contract\n", + "2 rise national contract crude\n", + "3 natural give close futures\n", + "4 domestic multi commodity exchange\n", + "5 demand company exchange analysts\n", + "6 likely index previous tocom\n", + "7 supply price market fell\n", + "8 fall heavy end accord\n", + "9 spot state active cue\n", + "10 traders export bourse natural\n", + "11 come increase trade trade\n", + "12 base currency traders rise\n", + "13 tyre make settle nmce\n", + "14 days high variety week\n", + "15 rain international widely synthetic\n", + "16 comment compare spot dollar\n", + "17 import expect rupee substitute\n", + "18 trader near monday fall\n", + "19 outlook wholesale week commodity\n", + "20 see market follow shanghai\n", + "21 output denominate rise benchmark\n", + "22 gain manufacture detail mercantile\n", + "23 global report benchmark york\n", + "24 production months show warehouse\n", + "25 endus major unchanged track\n", + "26 largest service indian manufacture\n", + "27 expectations production sell gain\n", + "28 remain country icex global\n", + "29 countries products higher stock\n", + "30 tap release track decline\n", + "31 expect buyers futures compare\n", + "32 arrivals department gain support\n", + "33 support period high higher\n", + "34 sharp stock lower weigh\n", + "35 stockists earlier weakness month\n", + "36 sessions auto month accredit\n", + "37 manufacturers flood buy lower\n", + "38 association quarter natural weak\n", + "39 season take steady concern" + ] + }, + "execution_count": 138, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_lda_topics(lda_model_best, 4)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualize topic keywords from best model" + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "metadata": { + "scrolled": true + }, + "outputs": 
[], + "source": [ + "dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')\n", + "corpus = pickle.load(open('corpus.pkl', 'rb'))\n", + "lda = gensim.models.ldamodel.LdaModel.load('model_bow_best.gensim')" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\pyLDAvis\\_prepare.py:257: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + " return pd.concat([default_term_info] + list(topic_dfs))\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 140, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pyLDAvis.gensim\n", + "lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)\n", + "pyLDAvis.display(lda_display)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train Data Mapping" + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "metadata": {}, + "outputs": [], + "source": [ + "documents_copy= documents.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
full_storypreprocess_join
0Sumitomo Rubber Industries has established a n...sumitomo industries establish natural procurem...
1Spot rubber closed unchanged on Thursday RSS ...spot close unchanged quote steady traders fini...
2Delegate registration for India Rubber Meet ...delegate registration meet begin meet hold rad...
3MUMBAI Futures contracts of rubber on the In...futures contract indian commodity exchange ris...
4Tapping has been delayed despite the fact that...tap delay despite fact peak season decline imp...
.........
5270Cogencis Wednesday Apr NEW DELHI India ...natural production fell year accord provisiona...
5271Cogencis Tuesday Apr By Prabhnoor Nanda N...futures contract national multi commodity exch...
5272OUTLOOK Futures contracts of natural rubber ma...outlook futures contract natural trade lower s...
5273Cogencis Tuesday Apr By Shikha Singh NEW ...price natural fell spot market poor demand tra...
5274Cogencis Monday Apr By Prabhnoor Nanda NEW...monday futures contract national multi commodi...
\n", + "

5275 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " full_story \\\n", + "0 Sumitomo Rubber Industries has established a n... \n", + "1 Spot rubber closed unchanged on Thursday RSS ... \n", + "2 Delegate registration for India Rubber Meet ... \n", + "3 MUMBAI Futures contracts of rubber on the In... \n", + "4 Tapping has been delayed despite the fact that... \n", + "... ... \n", + "5270 Cogencis Wednesday Apr NEW DELHI India ... \n", + "5271 Cogencis Tuesday Apr By Prabhnoor Nanda N... \n", + "5272 OUTLOOK Futures contracts of natural rubber ma... \n", + "5273 Cogencis Tuesday Apr By Shikha Singh NEW ... \n", + "5274 Cogencis Monday Apr By Prabhnoor Nanda NEW... \n", + "\n", + " preprocess_join \n", + "0 sumitomo industries establish natural procurem... \n", + "1 spot close unchanged quote steady traders fini... \n", + "2 delegate registration meet begin meet hold rad... \n", + "3 futures contract indian commodity exchange ris... \n", + "4 tap delay despite fact peak season decline imp... \n", + "... ... \n", + "5270 natural production fell year accord provisiona... \n", + "5271 futures contract national multi commodity exch... \n", + "5272 outlook futures contract natural trade lower s... \n", + "5273 price natural fell spot market poor demand tra... \n", + "5274 monday futures contract national multi commodi... 
\n", + "\n", + "[5275 rows x 2 columns]" + ] + }, + "execution_count": 142, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents_copy" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "metadata": {}, + "outputs": [], + "source": [ + "def identify_topic_sentence(row):\n", + " bow_vector = dictionary.doc2bow(preprocess(row['full_story']))\n", + " row1= lda_model_best[bow_vector][0]\n", + " row1 = sorted(row1, key=lambda x: (x[1]), reverse=True)\n", + " \n", + " topic_num= row1[0][0]\n", + " prop_topic= row1[0][1]\n", + " wp = lda_model_best.show_topic(topic_num)\n", + " topic_keywords = \", \".join([word for word, prop in wp])\n", + " row['Topic_NUmber']= int(topic_num)\n", + " if int(topic_num)== 1:\n", + " row['Topic_name']= 'Domestic market supply demand'\n", + " elif int(topic_num)== 2:\n", + " row['Topic_name']= 'Rubber price comparison'\n", + " elif int(topic_num)== 3:\n", + " row['Topic_name']= 'Commodity market price'\n", + " else:\n", + " row['Topic_name']= 'Stock exchange analysis' \n", + " row['Topic_Propability']= round(prop_topic,3)\n", + " row['Keywords']= topic_keywords\n", + " \n", + " return row" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": {}, + "outputs": [], + "source": [ + "doc= documents_copy.apply(identify_topic_sentence,axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "metadata": {}, + "outputs": [], + "source": [ + "doc.drop(['preprocess_join'],axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
full_storyTopic_NUmberTopic_nameTopic_PropabilityKeywords
0Sumitomo Rubber Industries has established a n...0Stock exchange analysis0.462price, market, rise, natural, domestic, demand...
1Spot rubber closed unchanged on Thursday RSS ...2Rubber price comparison0.757rupees, price, contract, close, commodity, exc...
2Delegate registration for India Rubber Meet ...1Domestic market supply demand0.899month, year, national, give, multi, company, i...
3MUMBAI Futures contracts of rubber on the In...2Rubber price comparison0.742rupees, price, contract, close, commodity, exc...
4Tapping has been delayed despite the fact that...0Stock exchange analysis0.892price, market, rise, natural, domestic, demand...
..................
5270Cogencis Wednesday Apr NEW DELHI India ...0Stock exchange analysis0.424price, market, rise, natural, domestic, demand...
5271Cogencis Tuesday Apr By Prabhnoor Nanda N...0Stock exchange analysis0.450price, market, rise, natural, domestic, demand...
5272OUTLOOK Futures contracts of natural rubber ma...3Commodity market price0.685price, contract, crude, futures, exchange, ana...
5273Cogencis Tuesday Apr By Shikha Singh NEW ...0Stock exchange analysis0.392price, market, rise, natural, domestic, demand...
5274Cogencis Monday Apr By Prabhnoor Nanda NEW...3Commodity market price0.447price, contract, crude, futures, exchange, ana...
\n", + "

5275 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " full_story Topic_NUmber \\\n", + "0 Sumitomo Rubber Industries has established a n... 0 \n", + "1 Spot rubber closed unchanged on Thursday RSS ... 2 \n", + "2 Delegate registration for India Rubber Meet ... 1 \n", + "3 MUMBAI Futures contracts of rubber on the In... 2 \n", + "4 Tapping has been delayed despite the fact that... 0 \n", + "... ... ... \n", + "5270 Cogencis Wednesday Apr NEW DELHI India ... 0 \n", + "5271 Cogencis Tuesday Apr By Prabhnoor Nanda N... 0 \n", + "5272 OUTLOOK Futures contracts of natural rubber ma... 3 \n", + "5273 Cogencis Tuesday Apr By Shikha Singh NEW ... 0 \n", + "5274 Cogencis Monday Apr By Prabhnoor Nanda NEW... 3 \n", + "\n", + " Topic_name Topic_Propability \\\n", + "0 Stock exchange analysis 0.462 \n", + "1 Rubber price comparison 0.757 \n", + "2 Domestic market supply demand 0.899 \n", + "3 Rubber price comparison 0.742 \n", + "4 Stock exchange analysis 0.892 \n", + "... ... ... \n", + "5270 Stock exchange analysis 0.424 \n", + "5271 Stock exchange analysis 0.450 \n", + "5272 Commodity market price 0.685 \n", + "5273 Stock exchange analysis 0.392 \n", + "5274 Commodity market price 0.447 \n", + "\n", + " Keywords \n", + "0 price, market, rise, natural, domestic, demand... \n", + "1 rupees, price, contract, close, commodity, exc... \n", + "2 month, year, national, give, multi, company, i... \n", + "3 rupees, price, contract, close, commodity, exc... \n", + "4 price, market, rise, natural, domestic, demand... \n", + "... ... \n", + "5270 price, market, rise, natural, domestic, demand... \n", + "5271 price, market, rise, natural, domestic, demand... \n", + "5272 price, contract, crude, futures, exchange, ana... \n", + "5273 price, market, rise, natural, domestic, demand... \n", + "5274 price, contract, crude, futures, exchange, ana... 
\n", + "\n", + "[5275 rows x 5 columns]" + ] + }, + "execution_count": 146, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "doc" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "metadata": {}, + "outputs": [], + "source": [ + "doc.to_excel('LDA_BOW_Input_Mapping.xlsx')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": 187, + "metadata": {}, + "outputs": [], + "source": [ + "test_document1 = 'MUMBAI – Rubber contracts on the Indian Commodity Exchange fell today tracking the benchmark contracts on Tokyo Commodity Exchange, traders said. The most-active February contract ended at 14,100 rupees per 100 kg, down 0.2% from the pervious close'\n", + "bow_vector = dictionary.doc2bow(preprocess(test_document1))" + ] + }, + { + "cell_type": "code", + "execution_count": 188, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(0, 0.010566269), (1, 0.96880543), (2, 0.013036869)]\n" + ] + } + ], + "source": [ + "print(lda_model_best.get_document_topics(bow_vector))" + ] + }, + { + "cell_type": "code", + "execution_count": 189, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Word (0, 1) (\"active\") \n", + "Word (1, 1) (\"benchmark\") \n", + "Word (2, 1) (\"close\") \n", + "Word (3, 2) (\"commodity\") \n", + "Word (4, 3) (\"contract\") \n", + "Word (5, 2) (\"exchange\") \n", + "Word (7, 1) (\"february\") \n", + "Word (8, 1) (\"indian\") \n", + "Word (9, 1) (\"mumbai\") \n", + "Word (11, 1) (\"today\") \n", + "Word (12, 1) (\"tokyo\") \n", + "Word (13, 1) (\"track\") \n" + ] + } + ], + "source": [ + "for i in range(len(bow_vector)):\n", + " print(\"Word {} (\\\"{}\\\") \".format(bow_vector[i], \n", + " dictionary[bow_vector[i][0]]))" + ] + }, + { + "cell_type": "code", + "execution_count": 190, + "metadata": { + "scrolled": false + }, + 
"outputs": [], + "source": [ + "# lda_model_best[bow_vector]" + ] + }, + { + "cell_type": "code", + "execution_count": 191, + "metadata": {}, + "outputs": [], + "source": [ + "sent_topics_df = pd.DataFrame()\n", + "row= lda_model_best[bow_vector][0]\n", + "row = sorted(row, key=lambda x: (x[1]), reverse=True)\n", + "\n", + "\n", + "\n", + "# for i, (topic_num, prop_topic) in enumerate(row):\n", + "# wp = lda_model_best.show_topic(topic_num)\n", + "# topic_keywords = \", \".join([word for word, prop in wp])\n", + "# sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)\n", + "\n", + "# print(sent_topics_df)\n", + "\n", + "topic_num= row[0][0]\n", + "prop_topic= row[0][1]\n", + "wp = lda_model_best.show_topic(topic_num)\n", + "topic_keywords = \", \".join([word for word, prop in wp])\n", + "sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 192, + "metadata": {}, + "outputs": [], + "source": [ + "contents = pd.Series(test_document1)\n", + "sent_topics_df = pd.concat([contents,sent_topics_df], axis=1, ignore_index= True)\n", + "sent_topics_df.rename(columns = {0:'Input_document', 1: 'Topic_NUmber', 2: 'Topic_Propability', 3: 'Keywords'}, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 193, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Input_documentTopic_NUmberTopic_PropabilityKeywords
0MUMBAI – Rubber contracts on the Indian Commod...1.00.9688contract, exchange, rupee, commodity, price, f...
\n", + "
" + ], + "text/plain": [ + " Input_document Topic_NUmber \\\n", + "0 MUMBAI – Rubber contracts on the Indian Commod... 1.0 \n", + "\n", + " Topic_Propability Keywords \n", + "0 0.9688 contract, exchange, rupee, commodity, price, f... " + ] + }, + "execution_count": 193, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sent_topics_df" + ] + }, + { + "cell_type": "code", + "execution_count": 181, + "metadata": {}, + "outputs": [], + "source": [ + "test_document3 = 'Today thailand recieved the highest rainfall. It may effect the production of natural rubber as well to a greater extent.'\n", + "bow_vector = dictionary.doc2bow(preprocess(test_document3))" + ] + }, + { + "cell_type": "code", + "execution_count": 182, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(0, 0.46492192), (1, 0.024663068), (2, 0.3061411), (3, 0.20427391)]\n" + ] + } + ], + "source": [ + "print(lda_model_best.get_document_topics(bow_vector))" + ] + }, + { + "cell_type": "code", + "execution_count": 183, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "sent_topics_df = pd.DataFrame()\n", + "row= lda_model_best[bow_vector][0]\n", + "row = sorted(row, key=lambda x: (x[1]), reverse=True)\n", + "\n", + "topic_num= row[0][0]\n", + "prop_topic= row[0][1]\n", + "wp = lda_model_best.show_topic(topic_num)\n", + "topic_keywords = \", \".join([word for word, prop in wp])\n", + "sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 184, + "metadata": {}, + "outputs": [], + "source": [ + "contents = pd.Series(test_document1)\n", + "sent_topics_df = pd.concat([contents,sent_topics_df], axis=1, ignore_index= True)\n", + "sent_topics_df.rename(columns = {0:'Input_document', 1: 'Topic_NUmber', 2: 'Topic_Propability', 3: 'Keywords'}, inplace = True)" + ] + }, + { 
+ "cell_type": "code", + "execution_count": 186, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Input_documentTopic_NUmberTopic_PropabilityKeywords
0MUMBAI – Rubber contracts on the Indian Commod...0.00.4649india, month, year, send, comment, feedback, r...
\n", + "
" + ], + "text/plain": [ + " Input_document Topic_NUmber \\\n", + "0 MUMBAI – Rubber contracts on the Indian Commod... 0.0 \n", + "\n", + " Topic_Propability Keywords \n", + "0 0.4649 india, month, year, send, comment, feedback, r... " + ] + }, + "execution_count": 186, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sent_topics_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}