-- SpellingWithContextIterator.lua
-- We want to be able to take a string of sentences separated by <eos> and use each of them
-- independently to learn how to fill in a word given its context. What we want to accomplish
-- is a function which maps a sentence with one masked word to the word that belongs there.
--
-- Step 1:
-- break string on <eos>:
-- Feed sentence 1 as it is correctly as an input
-- x_1 x_2 ... x_n
-- Then make a batch such that each word occurs once
-- <unk> x_2 ... x_n
-- x_1 <unk> ... x_n
-- .
-- .
-- .
-- x_1 x_2 ... <unk>
--
-- for each row, try to predict the correct value for <unk>
--
-- the corresponding network outputs should be
--
-- y_1 x_2 ... x_n
-- x_1 y_2 ... x_n
-- .
-- .
-- .
-- x_1 x_2 ... y_n
--
-- and the corresponding targets should be
--
-- x_1 x_2 ... x_n
-- x_1 x_2 ... x_n
-- .
-- .
-- .
-- x_1 x_2 ... x_n
--
-- Make a table that associates outputs of the neural network with the corresponding targets.
-- With a certain probability, replace the top ten most common outputs with the input during
-- the training regime in order to obtain corrections.
--
-- Suppose text is all our test data
require 'torch'
local txt_load_util = require 'txt_load_util'
local text = txt_load_util.getAlthingi('test')
local utf8 = require 'lua-utf8'
-- We add this function to txt_load_util
function string:split(scale) -- scale: char, word, snt, blob (TBI), txt
  local N = utf8.len(self)
  local token -- token depends on the scale
  local split_start = 1
  -- How we choose split_end depends on the scale
  -- If scale is char then it's the next char position
  -- If scale is word then it's the position as determined by find with token = ' '
  -- If scale is snt then it's the position as determined by find with token = '<eos>'
  -- If scale is blob then it's the position as determined by some attention mechanism (TBI)
  local split_end
  local output = {}
  if scale == 'char' then
    for i = 1, N do
      output[i] = utf8.sub(self, split_start, split_start)
      split_start = split_start + 1
    end
  else
    if scale == 'word' then
      token = ' '
    elseif scale == 'snt' then
      token = '<eos>'
    end
    -- Problem: utf8.find returns nil when no token is left in the string.
    -- If that happens, treat the rest of the string as the last piece.
    while split_start <= N do -- TODO use tokens to eliminate repeated spaces
      local token_start, token_end = utf8.find(self, token, split_start)
      if token_start == nil then token_start, token_end = N + 1, N end
      split_end = token_start - 1
      local str = utf8.sub(self, split_start, split_end)
      if utf8.len(str) > 0 then
        table.insert(output, str)
      end
      split_start = token_end + 1
    end
  end
  return output
end
-- Example of usage:
--th> txt = 'Ég fór út að labba.<eos>Hvernig var veðrið?<eos>Hvar er mamma?<eos>Hún var skelfingu lostin!<eos>'
-- [0.0000s]
--th> txt:split('snt')
--{
-- 1 : "Ég fór út að labba."
-- 2 : "Hvernig var veðrið?"
-- 3 : "Hvar er mamma?"
-- 4 : "Hún var skelfingu lostin!"
--}
-- We now turn each sentence into the fill-in-the-word problem described above
-- For example, for sentence 1:
-- <unk> fór út að labba.
-- Ég <unk> út að labba.
-- Ég fór <unk> að labba.
-- Ég fór út <unk> labba.
-- Ég fór út að <unk>
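-- A minimal sketch (not part of the original iterator) of how the masked inputs and
-- their targets could be built from one sentence. The helper name makeMaskedBatch and
-- the word-level split are illustrative assumptions.
local function makeMaskedBatch(sentence)
  local words = sentence:split('word')
  local inputs, targets = {}, {}
  for i = 1, #words do
    -- copy the sentence and mask the i-th word
    local masked = {}
    for j = 1, #words do
      masked[j] = (j == i) and '<unk>' or words[j]
    end
    inputs[i] = table.concat(masked, ' ')
    targets[i] = sentence -- the target is the original, unmasked sentence
  end
  return inputs, targets
end
-- makeMaskedBatch('Ég fór út að labba.') gives inputs like '<unk> fór út að labba.'
-- with the original sentence as every target.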
function string:test()
  local txt = 'Ég fór út að labba.<eos>Hvernig var veðrið?<eos>Hvar er mamma?<eos>Hún var skelfingu lostin!<eos>'
  -- split into sentences
  local sentences = txt:split('snt')
  return sentences
end
-- How to define a sequence_loader class
local sequence_loader = torch.class('sequence_loader')
function sequence_loader:__init(sequence, batchsize, bidirectional)
  assert(torch.isTensor(sequence))
  assert(torch.type(batchsize) == 'number')
  -- sequence is a tensor where the first dimension indexes time
  self.batchsize = batchsize
  self.bidirectional = bidirectional
  local seqlen = sequence:size(1)
  local size = sequence:size():totable()
  table.remove(size, 1)
  assert(#size == sequence:dim() - 1)
  self.data = sequence.new()
  -- note that some data at the end of the sequence may be lost
  -- number of time steps in each of the batchsize columns
  local seqlen2 = torch.floor(seqlen / batchsize)
  -- reshape into seqlen2 x batchsize
  self.data = sequence:sub(1, seqlen2 * batchsize):view(batchsize, seqlen2):t():contiguous()
end
A = sequence_loader(torch.rand(5),5,false) -- works but is meaningless
-- input 1 : sequence in tensor form
-- input 2 : batchsize - i.e. the size of each batch
-- input 3 : bidirectional - true or false
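-- The subiter below relies on the loader exposing sub and size methods. A minimal
-- sketch of what they could look like (not part of the original file; the shifted
-- next-step target is an assumption borrowed from standard language-model loaders):
function sequence_loader:sub(start, stop)
  -- inputs are the time steps [start, stop]; targets are the same window shifted by one
  assert(stop + 1 <= self.data:size(1), 'need one extra time step for the targets')
  local inputs = self.data:sub(start, stop)
  local targets = self.data:sub(start + 1, stop + 1)
  return inputs, targets
end
function sequence_loader:size()
  return self.data:size(1)
end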
-- To prepare input 1:
function txt_load_util.get_raw_data(txt_set, datapath)
  -- Dependencies
  local file = require('pl.file')
  local stringx = require('pl.stringx')
  local paths = require('paths')
  -- path to the directory containing the Althingi dataset on disk
  -- This is the current default if no argument is given
  -- the dir contains train.txt, valid.txt and test.txt
  datapath = datapath or '/home/thj92/DeepLearningNLP/Data/'
  -- load the raw data
  local filename = 'althingi.'..txt_set..'.txt'
  local filepath = paths.concat(datapath, filename)
  local text = file.read(filepath)
  text = stringx.replace(text, '\n', '<eos>')
  return text
end
local text = txt_load_util.get_raw_data('test', '/home/thj92/DeepLearningNLP/Data/')
-- tokens contains all the characters from the whole sequence representing our document.
-- We are going to build a table containing all the different unique tokens
function txt_load_util.buildVocab(tokens)
  local vocab = {}
  local ivocab = {}
  local counter = 1
  -- Store each character the first time it appears in tokens
  for i = 1, #tokens do
    local char = tokens[i]
    if vocab[char] == nil then
      ivocab[counter] = char
      vocab[char] = counter
      counter = counter + 1
    end
  end
  return vocab, ivocab
end
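-- For example, buildVocab({'a', 'b', 'a'}) returns
--   vocab  = { a = 1, b = 2 }  -- token -> id
--   ivocab = { 'a', 'b' }      -- id -> token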
local tokens = text:split('char') -- character-level tokens for the vocabulary
local charvocab, icharvocab = txt_load_util.buildVocab(tokens)
function txt_load_util.text2tensor(tokens, vocab)
  -- Build a tensor whose length is the number of tokens
  -- Each token will receive an entry in this one-dimensional tensor
  local tensor = torch.IntTensor(#tokens):fill(0)
  -- Each token is mapped to its id in the vocab
  for i, char in ipairs(tokens) do
    tensor[i] = vocab[char]
  end
  return tensor
end
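-- With tokens = {'a', 'b', 'a'} and the vocab above, text2tensor returns the IntTensor {1, 2, 1}.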
local tensor = txt_load_util.text2tensor(tokens, charvocab)
local batchsize = 32 -- example value
local loader = sequence_loader(tensor, batchsize, true)
-- To disseminate tomorrow!
-- subiter : for iterating over validation and test sets
function DataLoader:subiter(batchsize, epochsize, ...)
  batchsize = batchsize or 32
  local dots = {...}
  local size = self:size()
  epochsize = epochsize or -1
  epochsize = epochsize > 0 and epochsize or self:size()
  self._start = self._start or 1
  local nsampled = 0
  local stop
  local inputs, targets
  -- build iterator
  return function()
    if nsampled >= epochsize then
      return
    end
    local bs = math.min(nsampled + batchsize, epochsize) - nsampled
    stop = math.min(self._start + bs - 1, size)
    -- inputs and targets
    local batch = {self:sub(self._start, stop, inputs, targets, unpack(dots))}
    -- allows reuse of inputs and targets buffers for the next iteration
    inputs, targets = batch[1], batch[2]
    bs = stop - self._start + 1
    nsampled = nsampled + bs
    self._start = self._start + bs
    if self._start > size then
      self._start = 1
    end
    self:collectgarbage()
    return nsampled, unpack(batch)
  end
end
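-- Example of usage (assuming a dataload-style loader that provides subiter;
-- batchsize and epochsize are illustrative):
--th> for i, inputs, targets in loader:subiter(32, 1000) do
--      print(i, inputs:size(1), targets:size(1))
--    end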