From 2adb27457fa5aebb54db22f415dc45fcecb0a007 Mon Sep 17 00:00:00 2001 From: Arsham Shirvani Date: Fri, 4 Feb 2022 15:37:15 +0000 Subject: [PATCH 1/3] ref(vector): replace empty interface with element In this commit the empty interfaces are replaced with Element interfaces, and instead of relying on type assertions, we ask the vector to provide a concrete value. This has greatly increased the throughput of the program. --- Makefile | 2 +- file/chunk.go | 3 +- file/file.go | 6 +-- file/sort.go | 9 ++-- main.go | 12 +++-- main_test.go | 32 +++++++---- testdata/table.shuffled.tsv | 104 ++++++++++++++++++++++++++++++++++++ testdata/table.sorted.tsv | 104 ++++++++++++++++++++++++++++++++++++ vector/element.go | 16 ++++++ vector/int_vector.go | 49 ++++++++--------- vector/string_vector.go | 35 ++++++------ vector/table_vector.go | 89 ++++++++++++++++++++++++++++++ vector/vector.go | 33 ++++++++---- 13 files changed, 420 insertions(+), 74 deletions(-) create mode 100644 testdata/table.shuffled.tsv create mode 100644 testdata/table.sorted.tsv create mode 100644 vector/element.go create mode 100644 vector/table_vector.go diff --git a/Makefile b/Makefile index 3999d13..ebf5e35 100644 --- a/Makefile +++ b/Makefile @@ -4,4 +4,4 @@ test: .PHONY: run run: - go run main.go \ No newline at end of file + go run main.go diff --git a/file/chunk.go b/file/chunk.go index 1282931..89cdce0 100644 --- a/file/chunk.go +++ b/file/chunk.go @@ -5,7 +5,6 @@ import ( "os" "github.com/askiada/external-sort/vector" - "github.com/pkg/errors" ) @@ -92,7 +91,7 @@ func (c *chunks) len() int { } // min Check all the first elements of all the chunks and returns the smallest value.
-func (c chunks) min() (minChunk *chunkInfo, minValue interface{}, minIdx int) { +func (c chunks) min() (minChunk *chunkInfo, minValue vector.Element, minIdx int) { for i, chunk := range c.list { currValue := chunk.buffer.Get(0) if i == 0 { diff --git a/file/file.go b/file/file.go index 3bf9624..945e6f4 100644 --- a/file/file.go +++ b/file/file.go @@ -23,9 +23,9 @@ func (f *Info) Sort(file io.Reader) error { scanner := bufio.NewScanner(file) for scanner.Scan() { text := scanner.Text() - err := vector.Sort(ans, text) + err := vector.Sort(ans, text, "\t", 0) if err != nil { - return err + return errors.Wrap(err, "sorting file") } } if scanner.Err() != nil { @@ -64,7 +64,7 @@ func (f *Info) CreateSortedChunks(chunkFolder string, dumpSize int) ([]string, e ans = f.Allocate(dumpSize) } text := scanner.Text() - err := vector.Sort(ans, text) + err := vector.Sort(ans, text, "\t", 0) if err != nil { return nil, errors.Wrap(err, fn) } diff --git a/file/sort.go b/file/sort.go index 30b567d..26243e0 100644 --- a/file/sort.go +++ b/file/sort.go @@ -3,6 +3,8 @@ package file import ( "fmt" "runtime" + + "github.com/askiada/external-sort/vector" ) type MemUsage struct { @@ -34,7 +36,7 @@ func bToMb(b uint64) uint64 { return b / 1024 / 1024 } -func (f *Info) MergeSort(chunkPaths []string, k int) (output []interface{}, err error) { +func (f *Info) MergeSort(chunkPaths []string, k int) (output []vector.Element, err error) { mu := &MemUsage{} // create a chunk per file path chunks := &chunks{list: make([]*chunkInfo, 0, len(chunkPaths))} @@ -44,10 +46,7 @@ func (f *Info) MergeSort(chunkPaths []string, k int) (output []interface{}, err return nil, err } } - for { - if chunks.len() == 0 { - break - } + for chunks.len() > 0 { mu.Collect() toShrink := []int{} // search the smallest value across chunk buffers by comparing first elements only diff --git a/main.go b/main.go index ca6ee43..4cc5d3b 100644 --- a/main.go +++ b/main.go @@ -31,22 +31,26 @@ func main() { defer f.Close() fI := 
&file.Info{ Reader: f, - Allocate: vector.AllocateIntVector, + Allocate: vector.AllocateTableVector("\t", 0), } // create small files with maximum 30 rows in each - chunkPaths, err := fI.CreateSortedChunks("data/chunks", 4) + chunkPaths, err := fI.CreateSortedChunks("data/chunks", 1000) if err != nil { panic(err) } + // TODO: remove files if the process crashes. + // perform a merge sort on all the chunks files. // we sort using a buffer so we don't have to load the entire chunks when merging - output, err := fI.MergeSort(chunkPaths, 3) + output, err := fI.MergeSort(chunkPaths, 10_000) if err != nil { panic(err) } // this output could be saved on hard drive // or we can imagine send events everytime an element is added to it // of course it will require MergeSort to return a channel - fmt.Println(output) + for _, line := range output { + fmt.Println(line.Value()) + } } diff --git a/main_test.go b/main_test.go index e8eda13..4d87f65 100644 --- a/main_test.go +++ b/main_test.go @@ -2,15 +2,15 @@ package main_test import ( "errors" - "io/ioutil" "os" "path" + "strings" "testing" "github.com/askiada/external-sort/file" "github.com/askiada/external-sort/vector" - "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func prepareChunks(t *testing.T, filname string, chunkSize int) (*file.Info, []string) { @@ -20,14 +20,14 @@ func prepareChunks(t *testing.T, filname string, chunkSize int) (*file.Info, []s fI := &file.Info{ Reader: f, - Allocate: vector.AllocateIntVector, + Allocate: vector.AllocateTableVector("", 0), } chunkPaths, err := fI.CreateSortedChunks("testdata/chunks", chunkSize) assert.NoError(t, err) t.Cleanup(func() { defer f.Close() - dir, err := ioutil.ReadDir("testdata/chunks") + dir, err := os.ReadDir("testdata/chunks") assert.NoError(t, err) for _, d := range dir { err = os.RemoveAll(path.Join("testdata/chunks", d.Name())) @@ -38,22 +38,32 @@ func prepareChunks(t *testing.T, filname string, chunkSize int) (*file.Info, []s return 
fI, chunkPaths } -func Test(t *testing.T) { +func TestMergeSort(t *testing.T) { + got, err := os.ReadFile("testdata/table.sorted.tsv") + require.NoError(t, err) + + tableSorted := strings.Split(string(got), "\n") + tableSorted = tableSorted[:len(tableSorted)-1] + tcs := map[string]struct { filename string expectedErr error - expectedOutput []interface{} + expectedOutput []string }{ "empty file": { filename: "testdata/emptyfile.tsv", }, "one elem": { filename: "testdata/oneelem.tsv", - expectedOutput: []interface{}{1}, + expectedOutput: []string{"1"}, }, "100 elems": { filename: "testdata/100elems.tsv", - expectedOutput: []interface{}{3, 4, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10, 10, 15, 18, 18, 18, 18, 21, 22, 22, 25, 25, 25, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 29, 30, 30, 31, 31, 33, 33, 34, 36, 37, 39, 39, 39, 40, 41, 41, 42, 43, 43, 47, 47, 49, 50, 50, 52, 52, 53, 54, 55, 55, 55, 56, 57, 57, 59, 60, 61, 62, 63, 67, 71, 71, 72, 72, 73, 74, 75, 78, 79, 80, 80, 82, 89, 89, 89, 91, 91, 92, 92, 93, 93, 94, 97, 97, 99}, + expectedOutput: []string{"3", "4", "5", "6", "6", "7", "7", "7", "8", "8", "9", "9", "10", "10", "15", "18", "18", "18", "18", "21", "22", "22", "25", "25", "25", "25", "25", "26", "26", "27", "27", "28", "28", "29", "29", "29", "30", "30", "31", "31", "33", "33", "34", "36", "37", "39", "39", "39", "40", "41", "41", "42", "43", "43", "47", "47", "49", "50", "50", "52", "52", "53", "54", "55", "55", "55", "56", "57", "57", "59", "60", "61", "62", "63", "67", "71", "71", "72", "72", "73", "74", "75", "78", "79", "80", "80", "82", "89", "89", "89", "91", "91", "92", "92", "93", "93", "94", "97", "97", "99"}, + }, + "table file": { + filename: "testdata/table.shuffled.tsv", + expectedOutput: tableSorted, }, } @@ -65,7 +75,11 @@ func Test(t *testing.T) { for chunkSize := 1; chunkSize < 152; chunkSize += 10 { for bufferSize := 1; bufferSize < 152; bufferSize += 10 { fI, chunkPaths := prepareChunks(t, filename, chunkSize) - got, err := 
fI.MergeSort(chunkPaths, bufferSize) + elements, err := fI.MergeSort(chunkPaths, bufferSize) + got := make([]string, 0, len(elements)) + for _, e := range elements { + got = append(got, e.Value()) + } assert.ElementsMatch(t, got, expectedOutput) assert.True(t, errors.Is(err, expectedErr)) } diff --git a/testdata/table.shuffled.tsv b/testdata/table.shuffled.tsv new file mode 100644 index 0000000..afa4476 --- /dev/null +++ b/testdata/table.shuffled.tsv @@ -0,0 +1,104 @@ +69 2_qwertsdjh sr1u11 swss72117873 xxxxxx::xxxxxx:track:5ctrY2rmphwswfrbxtrtuw ss wassvwssw r1tvst 1rs 260 75.0000 +22 22_qwertsdjh 2b92p0 sssdw1800011 sdsd1212313::1446817573 2ahs 2sush1 214 73.3400 +61 27_qwertsdjh r2372d rtd039501700 pandora::19789159 rÉrwus s1 rwuus ru1s2s hasw1ta 287 50.0000 +89 50_qwertsdjh w9565d 1vss70900628 gtl::87445503 wrwÖhs sasshvs 2ws rÖtvwstwsst 182 37.5000 +47 27_qwertsdjh a71591 ps6a42178444 pandora::126424921 aara aara ss sh1sats 2s vw uaw rtswdY 222 100.0000 +3 2_qwertsdjh r6420v pd9Y21757715 xxxxxx::xxxxxx:track:0rjwsgiwsmhehgbarn1s27 rtY rat wsua s1-ru1ss,1rs awr1wwa,u1bwww ah 21 9s1v 235 50.0000 +48 27_qwertsdjh 28050d sw15r1200394 pandora::23879758 21s1ws ts2 222 32.5000 +66 15_qwertsdjh r17vua huw762000295 sdsdsdsd::b-kw9wd89ys r1Y vaa ssrw tb1ws 163 50.0000 +86 22_qwertsdjh d19162 sws4r0725625 sdsd1212313::307189094 wvatY ar uabs (s1t1ass bstwwah) wvwhwt1Y sswwr 148 50.0000 +16 50_qwertsdjh 97409r 2sss32000248 gtl::108047090 9s twwwv 91rs wwwtsh ww 317 58.3300 +52 23_qwertsdjh 918pm7 vr19w1272094 ajdhkhdqjh::9008shs4as 9u1rs wwssa aaat s1hw aawwah 306 100.0000 +45 30_qwertsdjh 9b9sw9 swsYw1103138 tidal::61275086 9sbstuY swuuw 1.rw1u 162 45.0000 +36 22_qwertsdjh h23wa5 pdhrs2083605 sdsd1212313::1542132188 havwwhw 1v 1uu vws wu1s ssv1vwahw 274 20.0000 +23 23_qwertsdjh w38363 swhtt0400336 ajdhkhdqjh::900wpdsss6 1 wvatY 1v vwtss 1rw 233 100.0000 +104 27_qwertsdjh d8568v vr1r11439569 pandora::64481461 9sh2wrwah s1sw swwssuwva 168 50.0000 +82 22_qwertsdjh t14119 
2s9u60409626 sdsd1212313::777322913 twswhwau2 wt1bs 2wwwst 304 55.2900 +28 15_qwertsdjh w95464 w92812002376 sdsdsdsd::wa5w85riun4 wv1ttY hwwwv vt1hrs 919Y usuu19Y 272 5.0000 +67 15_qwertsdjh vr770a sw6641900012 sdsdsdsd::spwyurwu9rg vwwhs w ws1t2 1 tssast rtwwvwh1 pssw121 247 50.0000 +10 50_qwertsdjh w09926 r1s460423164 gtl::3133751 ww1ss Yast 9aavY rashv2arh swp s1wvstw 190 75.0000 +41 22_qwertsdjh ab99su swws20350142 sdsd1212313::4420113 a1t1haw2 w1srY sahsY 207 100.0000 +34 2_qwertsdjh r1049Y 9ttws7800038 xxxxxx::xxxxxx:track:17uissd0wrfeptenj1s05p r121w uswd ssua2w1 228 50.0000 +26 23_qwertsdjh d67261 2saw81401284 ajdhkhdqjh::907sw92t6t 21hrs rwvw ss vas habY & wwsahhs raaast 457 70.0000 +88 10_qwertsdjh rb9bvw sw58u1225309 qobuz::48467053 vws r1t wh 1a1tvsshv 1812 wwubst msrw 120 33.3300 +32 2_qwertsdjh w25pa3 w9rvr1000445 xxxxxx::xxxxxx:track:0vny8j5qups0yo4tj377th wassvwwhw va 9s 2ahs s1wvst au1h whr 201 50.0000 +87 22_qwertsdjh s0710p au1460400403 sdsd1212313::1465920034 sswswd at1r2Ę sÓrwĆ saww, ms2sst, m1Źr1 248 33.3200 +101 45_qwertsdjh 915446 9sr099800304 melodyvr::vra.26626553 9wvwus w19w9 sawvÉ 356 50.0000 +78 2_qwertsdjh r139sv hu2911801894 xxxxxx::xxxxxx:track:4175seztwgsxqadYxgf99z 21hdst - 140 9as ratsasv bstwwah 2m ratsasv whwvtsrvat 203 50.0000 +37 22_qwertsdjh w71312 wsrb91301601 sdsd1212313::647924566 w1ssst ar vws wa2w: aashwhw vwvus 9shm1swh r1uurwwrw 144 50.0000 +44 17_qwertsdjh 9979mr audiomack::469772 9arvwsw r twvvd Ysu1raur 2m a1su 262 25.0000 +71 22_qwertsdjh 901407 pdwh92150458 sdsd1212313::1572040800 919Y hswv1 154 7.5000 +46 30_qwertsdjh rb9Ya7 swpY51238169 tidal::86056107 ratu2rw2s w1uuww rusppp 194 50.0000 +42 22_qwertsdjh rb72p4 hu9Y91000092 sdsd1212313::986079123 r1uust w2 hwr r1hrwsuuw 476 50.0000 +29 15_qwertsdjh r94rrp 9p1dY2000011 sdsdsdsd::tfrs42ot92s rat1ÇÃa 2s wsua usrw1ha bwauswta s msuw1ha 21s1w 181 2.0000 +56 17_qwertsdjh 7s4w8Y audiomack::8016659 7.62 (r1wv) Yrh usrrw 149 75.0000 +13 30_qwertsdjh r6062s sYws19510027 
tidal::32773189 rwhv1 bstwwss wwws1 308 100.0000 +33 2_qwertsdjh hr4s17 pdhrt2016808 xxxxxx::xxxxxx:track:7bo4rs4spmY83wwmj9gdwm h 21 waav vt1$sw,Ywh vt1aa 147 87.5000 +62 27_qwertsdjh wb2sb7 psrvs1400012 pandora::58993839 ww wv ratvw wv? (wshs1h ratu2) b1twasw 1tvwwvw|21sw1h s1tusY 277 75.0000 +91 30_qwertsdjh a5502s 9twss9700316 tidal::5118600 a 21 bw21 (va21 u1 bw21) 2aswha 239 37.5000 +38 22_qwertsdjh h4894d sw1561385905 sdsd1212313::1260823556 hsbst 91rs 2arh (rs1v. r1ww1rh & tssw) t1w 2www1 263 66.6500 +64 23_qwertsdjh vb998h sw8s61400001 ajdhkhdqjh::900ua0rwda 2:161s (rs1v. uYtwr mahsw) tY1h swuwats 241 100.0000 +49 23_qwertsdjh s0704w wsap11300762 ajdhkhdqjh::907w3suY23 sh2a b1twasw 1tvwwvw 187 50.0000 +92 30_qwertsdjh r39265 sw4t31425489 tidal::39491477 ratsbst vtaaa & 1u s1r 210 33.3300 +53 15_qwertsdjh p90bhY sw1012059277 sdsdsdsd::bwdsprjmr1Y psstw2a 1swwa a1sua 229 60.0000 +94 27_qwertsdjh 905757 w9ssm0800307 pandora::245005 9svvst vwwhww s1twhs aaur1tv 234 15.0000 +14 27_qwertsdjh s17910 swsw10910432 pandora::23915001 w swww Yas 91wwwshvst 335 16.6700 +39 22_qwertsdjh wb85rs wvr941100228 sdsd1212313::435903489 wau2sh rwwu2 1s1ds2 332 75.0000 +30 15_qwertsdjh rb9wm6 vr12w1748546 sdsdsdsd::uweY5ww06wc ratwwbs sbstuwrs 232 100.0000 +25 23_qwertsdjh 25488s sw5021901413 ajdhkhdqjh::9089h1whdm 2aw wat1w s1w (tss1wvstwd12a) uaw ts9su2sw 242 50.0000 +40 22_qwertsdjh s13s5d hur711812807 sdsd1212313::1441760307 s1Y9s hav m1h 9uaspbwwv 218 50.0000 +55 15_qwertsdjh w4688h swws90654910 sdsdsdsd::cg19wvwyiac wsv 1r1Y rtas vws raaw s.a.v.a. 119 50.0000 +20 22_qwertsdjh u92ppv 9sa012130008 sdsd1212313::1576690692 uwbswvts1s awr1t 1h2 vws raur 247 25.0000 +59 26_qwertsdjh r86898 sw1561124650 trebel::18523836 rtsr uabs (2t1ss rs1v. 
vws rsssh2 vtw9svs) vws 2ts1s vs1s 208 0.6500 +27 23_qwertsdjh ub8h9a rtw711600090 ajdhkhdqjh::901r6hw5tr u1 awwrwhs (u'wsaÉt1vtwrs tsswp) wYahauabs 300 50.0000 +11 45_qwertsdjh w6311r sw9rd2000732 melodyvr::vra.604712996 wa2 wa uabs2 uwrsr1Y ratwwwa 238 37.5000 +6 25_qwertsdjh 99431b sw4s41308606 iheartradio::35881602 whvstbwsr '9s1wv rwvwwh' wh vwww sasshv 47 2.5000 +12 26_qwertsdjh vs5b2h 2ss071200425 trebel::110887615 v1ss 1 uwvvus vwss va 2a twwwv ts2Y t1Y saats 113 75.0000 +99 11_qwertsdjh 26809w haswp2018120 idagio::39770836haswp2018120 2swuww st 2sh wwsssu 9uÅ whwst-uwws suwts2, s1twv vØh2su 9a2w9stw rsY2s, sbwh2suwws wvs2shvstw w1hwratshwhw 326 50.0000 +72 22_qwertsdjh rb9ars sw35s0410906 sdsd1212313::54533555 r1wwhw w1aaY wa uwrsY 250 75.0000 +81 2_qwertsdjh a02689 rw6540863851 xxxxxx::xxxxxx:track:6lb15ss0urwnywbrum7dz0 au2 rs1shsww (atwwwh1uuY astratss2 9Y v1hY1 vsrsst) - s1t1ass bstwwah wwv vshsw s1t1ass 213 12.5000 +70 2_qwertsdjh s19ap0 swss71923044 xxxxxx::xxxxxx:track:5des269wldorq3swa9cpcs s1wY - tsswp mw1Y ratvsd,adsh1 220 91.2500 +98 15_qwertsdjh vs7wss sw2p60900920 sdsdsdsd::9r9v9sslp2o rtssr1Y swuust bwaushv m 317 50.0000 +50 23_qwertsdjh as8aaY 2s4Y12009601 ajdhkhdqjh::90898s7s4a a1ast ru12ww1u 154 70.0000 +65 15_qwertsdjh w18301 sw6690610001 sdsdsdsd::sw4ekh3hr1g whvta tars|m1wtars'h ata2srvwahw 102 33.3300 +60 13_qwertsdjh s5988Y w9ss71105464 deezer::668712592 swt1rus ratsst wsastws1bY 249 40.0000 +4 2_qwertsdjh a6451w sw6bp1170243 xxxxxx::xxxxxx:track:2xsrpuyo23ag6ctrklx10w a1sws - 2tss & 91ww bstwwah aaa s1wwrw1hw 163 26.6600 +9 50_qwertsdjh abr60v pstvm1900703 gtl::97445662 au1Y w1t2st s1rwsu sahv1ha 221 99.0000 +68 2_qwertsdjh 997s72 r17rb1800007 xxxxxx::xxxxxx:track:20h6qn1pzkakwmnh6vwmb1 9w 9w 9ap bwwwvÈh 234 10.0000 +54 15_qwertsdjh r927vt swu4t2024809 sdsdsdsd::ju69v6svsz8 r1hb1w m1Y swhwst 210 10.0000 +73 22_qwertsdjh wb7uss 1vh261454510 sdsd1212313::877713041 wtssh-sYs2 1s9stw1h 21rh 268 42.5000 +17 22_qwertsdjh r8127d 
w9ss71104013 sdsd1212313::1538737219 rwvY wtttu (rs1v. wwwah) rww 264 5.0000 +85 22_qwertsdjh w2172Y 1t1820800190 sdsd1212313::418470246 vws w1t2sh wrssv uwvvus 91h2 282 24.7400 +2 2_qwertsdjh w3893w w9srv0500144 xxxxxx::xxxxxx:track:2wlagmby9Y2s8ssqz1awwm ww2sah Yasvw 191www 1uu wv1tw 195 100.0000 +51 23_qwertsdjh w3718b 2sh920300077 ajdhkhdqjh::900856wwar wuw2s9 vws 2dww1h & s1swsh atrwswvt1 495 50.0000 +103 30_qwertsdjh r30478 w99ws8800408 tidal::110958828 rwstwr m1uw ssw1 m1r1t1 514 25.0000 +75 22_qwertsdjh w09926 2s9u61062795 sdsd1212313::375063825 ww1ss Yast 9a2Y (vssvahws bwwwah swp) vsusrshsY 408 75.0000 +7 20_qwertsdjh 99619a swss71308527 livexlive::ssw-1943101 #9s1svwrsu s1tw1w r1tsY 199 6.0000 +96 15_qwertsdjh a6451w psbtt1230072 sdsdsdsd::h2rj6h-c9wp a1sws t & 9 rw1tvwv1tw 181 26.6600 +15 50_qwertsdjh sb7m8w sw7wr1555821 gtl::115028663 sh1 s1w mswwY m 217 66.6600 +95 23_qwertsdjh w1243s w9aw81529126 ajdhkhdqjh::9015aw2rps wau2whw ah (124 9as) ratsasv m1sw 316 25.0000 +21 22_qwertsdjh Y27190 2sr871209127 sdsd1212313::581470070 Yas r1h'v ww2s rtas Yastwsur (rs1v. asYvah) [sw2 s1wwwbs 2s9 swp] a1su w1t2hst 436 15.6200 +74 22_qwertsdjh d8536h rar010024407 sdsd1212313::1531918169 u1 r1aahst1 1urts2a wsvwsttsd 291 100.0000 +35 22_qwertsdjh r31377 swwd20818023 sdsd1212313::296042772 rssu vws rshs (ah sY 9a2Y) s, m 355 25.0000 +90 30_qwertsdjh wb95bw sw4s41631207 tidal::61932110 wattY hav wattY wsswhw wYh2tass 214 2.0000 +1 2_qwertsdjh wr9mht pswt32038728 xxxxxx::xxxxxx:track:6ptswsa5z5eubh9sawbcs5 w19sw pss vs pswsta rt1h r, mah1s1a1dwa, a19ua rwwuu - s 225 100.0000 +93 27_qwertsdjh s16wuh str712000004 pandora::94806377 sabwsw 2sbwv1 116 40.0000 +43 10_qwertsdjh r39265 swr911514890 qobuz::46805323 ratsbst (rs1v. 
w1hvwwau2 & 1wv) w uabs s1sahhsh 280 33.3300 +79 2_qwertsdjh 9b1ssY xxxxxx::xxxxxx:track:22i0v22w6Yshhcar5dw5pm 9sth wv 1uu 1r1Y wrauuar 216 30.0000 +76 22_qwertsdjh t93u11 2saw82022224 sdsd1212313::1545333643 tssss9st Yas wst1wh1 vsuuw 268 60.0000 +97 15_qwertsdjh r2337v pdsw51954661 sdsdsdsd::o3Ysi0219is rw1v'w Yast h1ss (rs1v. ruas2$) rwssvt1wuw 173 10.0000 +24 23_qwertsdjh t1802a huw620468562 ajdhkhdqjh::9005r9rpa1 t12wa raa1h b1twasw 1tvwwvw 265 80.0000 +5 20_qwertsdjh 11510w swws22058343 livexlive::wsp-56728999 1ts rs vwsts Ysv? rarassuah 148 0.1000 +57 45_qwertsdjh rb9sa1 ps6h21738621 melodyvr::vra.268417020 r1uuwhw wwddu1 236 50.0000 +63 23_qwertsdjh w3526w sw32r1710388 ajdhkhdqjh::9076r84Y6u Yas wwasu2 wa b1twasw 1tvwwvw 86 12.5000 +100 5_qwertsdjh ub91ss w9bsv0700100 soundcloud::736209364 ushrw 9ap (uwbs) s1twuYh s1hwah 598 75.0000 +19 22_qwertsdjh r60093 r1w370710124 sdsd1212313::212907153 rwrs ar 91vw'w atauawss 9191 9twhss1h 119 50.0000 +8 5_qwertsdjh u06012 spr551405074 soundcloud::773128648 u1 hswt1 rsuwh1 wsast wtsaa rauas9w1 297 100.0000 +84 22_qwertsdjh w1259w swsw12100515 sdsd1212313::1586741468 wt1rs ssuuY atwrs 208 50.0000 +102 26_qwertsdjh r52150 ps4vr2145859 trebel::117007958 rwaus uavv1 ss vt1rY u1rtshrs 247 33.3300 +18 22_qwertsdjh w73602 2sh960100423 sdsd1212313::1452653119 wYsawahY ha. 3 wh 2 swhat / av. 2: 3. rasa2a. wrwstd1h2a. 
awhs w1wv rwshst awwuw1tsahwsst, awstts 9asusd 996 100.0000 +31 2_qwertsdjh wr4Y72 swrsY1222270 xxxxxx::xxxxxx:track:63t02lqk5gjjb2trnpc3i3 w 1s 1 hsr rts1vwah - wauwv-vt1rs sws swvda vwasawah, s1tsh swvda wwu2st9t1h2 154 50.0000 +80 2_qwertsdjh p07221 swsu19900510 xxxxxx::xxxxxx:track:78ppwnubrvuy35b65vrlz0 pss uarst1 sh1sat1tss 2s vw s22ws w1hvw1wa 309 100.0000 +58 45_qwertsdjh 295a2p swp2t2011404 melodyvr::vra.552184754 2wss vÚ adsh1 & 1hssu 11 178 4.0000 +77 10_qwertsdjh s03121 swwt10029006 qobuz::53680325 sh9ts1s19us m1ssw hsrvah war1t2 203 41.0000 +83 22_qwertsdjh s24342 2sh061301383 sdsd1212313::653518482 s1tw1 (w uwss wv uas2) [tss1wvsts2] wraavst 267 25.0000 diff --git a/testdata/table.sorted.tsv b/testdata/table.sorted.tsv new file mode 100644 index 0000000..28ab833 --- /dev/null +++ b/testdata/table.sorted.tsv @@ -0,0 +1,104 @@ +1 2_qwertsdjh wr9mht pswt32038728 xxxxxx::xxxxxx:track:6ptswsa5z5eubh9sawbcs5 w19sw pss vs pswsta rt1h r, mah1s1a1dwa, a19ua rwwuu - s 225 100.0000 +2 2_qwertsdjh w3893w w9srv0500144 xxxxxx::xxxxxx:track:2wlagmby9Y2s8ssqz1awwm ww2sah Yasvw 191www 1uu wv1tw 195 100.0000 +3 2_qwertsdjh r6420v pd9Y21757715 xxxxxx::xxxxxx:track:0rjwsgiwsmhehgbarn1s27 rtY rat wsua s1-ru1ss,1rs awr1wwa,u1bwww ah 21 9s1v 235 50.0000 +4 2_qwertsdjh a6451w sw6bp1170243 xxxxxx::xxxxxx:track:2xsrpuyo23ag6ctrklx10w a1sws - 2tss & 91ww bstwwah aaa s1wwrw1hw 163 26.6600 +5 20_qwertsdjh 11510w swws22058343 livexlive::wsp-56728999 1ts rs vwsts Ysv? 
rarassuah 148 0.1000 +6 25_qwertsdjh 99431b sw4s41308606 iheartradio::35881602 whvstbwsr '9s1wv rwvwwh' wh vwww sasshv 47 2.5000 +7 20_qwertsdjh 99619a swss71308527 livexlive::ssw-1943101 #9s1svwrsu s1tw1w r1tsY 199 6.0000 +8 5_qwertsdjh u06012 spr551405074 soundcloud::773128648 u1 hswt1 rsuwh1 wsast wtsaa rauas9w1 297 100.0000 +9 50_qwertsdjh abr60v pstvm1900703 gtl::97445662 au1Y w1t2st s1rwsu sahv1ha 221 99.0000 +10 50_qwertsdjh w09926 r1s460423164 gtl::3133751 ww1ss Yast 9aavY rashv2arh swp s1wvstw 190 75.0000 +11 45_qwertsdjh w6311r sw9rd2000732 melodyvr::vra.604712996 wa2 wa uabs2 uwrsr1Y ratwwwa 238 37.5000 +12 26_qwertsdjh vs5b2h 2ss071200425 trebel::110887615 v1ss 1 uwvvus vwss va 2a twwwv ts2Y t1Y saats 113 75.0000 +13 30_qwertsdjh r6062s sYws19510027 tidal::32773189 rwhv1 bstwwss wwws1 308 100.0000 +14 27_qwertsdjh s17910 swsw10910432 pandora::23915001 w swww Yas 91wwwshvst 335 16.6700 +15 50_qwertsdjh sb7m8w sw7wr1555821 gtl::115028663 sh1 s1w mswwY m 217 66.6600 +16 50_qwertsdjh 97409r 2sss32000248 gtl::108047090 9s twwwv 91rs wwwtsh ww 317 58.3300 +17 22_qwertsdjh r8127d w9ss71104013 sdsd1212313::1538737219 rwvY wtttu (rs1v. wwwah) rww 264 5.0000 +18 22_qwertsdjh w73602 2sh960100423 sdsd1212313::1452653119 wYsawahY ha. 3 wh 2 swhat / av. 2: 3. rasa2a. wrwstd1h2a. awhs w1wv rwshst awwuw1tsahwsst, awstts 9asusd 996 100.0000 +19 22_qwertsdjh r60093 r1w370710124 sdsd1212313::212907153 rwrs ar 91vw'w atauawss 9191 9twhss1h 119 50.0000 +20 22_qwertsdjh u92ppv 9sa012130008 sdsd1212313::1576690692 uwbswvts1s awr1t 1h2 vws raur 247 25.0000 +21 22_qwertsdjh Y27190 2sr871209127 sdsd1212313::581470070 Yas r1h'v ww2s rtas Yastwsur (rs1v. 
asYvah) [sw2 s1wwwbs 2s9 swp] a1su w1t2hst 436 15.6200 +22 22_qwertsdjh 2b92p0 sssdw1800011 sdsd1212313::1446817573 2ahs 2sush1 214 73.3400 +23 23_qwertsdjh w38363 swhtt0400336 ajdhkhdqjh::900wpdsss6 1 wvatY 1v vwtss 1rw 233 100.0000 +24 23_qwertsdjh t1802a huw620468562 ajdhkhdqjh::9005r9rpa1 t12wa raa1h b1twasw 1tvwwvw 265 80.0000 +25 23_qwertsdjh 25488s sw5021901413 ajdhkhdqjh::9089h1whdm 2aw wat1w s1w (tss1wvstwd12a) uaw ts9su2sw 242 50.0000 +26 23_qwertsdjh d67261 2saw81401284 ajdhkhdqjh::907sw92t6t 21hrs rwvw ss vas habY & wwsahhs raaast 457 70.0000 +27 23_qwertsdjh ub8h9a rtw711600090 ajdhkhdqjh::901r6hw5tr u1 awwrwhs (u'wsaÉt1vtwrs tsswp) wYahauabs 300 50.0000 +28 15_qwertsdjh w95464 w92812002376 sdsdsdsd::wa5w85riun4 wv1ttY hwwwv vt1hrs 919Y usuu19Y 272 5.0000 +29 15_qwertsdjh r94rrp 9p1dY2000011 sdsdsdsd::tfrs42ot92s rat1ÇÃa 2s wsua usrw1ha bwauswta s msuw1ha 21s1w 181 2.0000 +30 15_qwertsdjh rb9wm6 vr12w1748546 sdsdsdsd::uweY5ww06wc ratwwbs sbstuwrs 232 100.0000 +31 2_qwertsdjh wr4Y72 swrsY1222270 xxxxxx::xxxxxx:track:63t02lqk5gjjb2trnpc3i3 w 1s 1 hsr rts1vwah - wauwv-vt1rs sws swvda vwasawah, s1tsh swvda wwu2st9t1h2 154 50.0000 +32 2_qwertsdjh w25pa3 w9rvr1000445 xxxxxx::xxxxxx:track:0vny8j5qups0yo4tj377th wassvwwhw va 9s 2ahs s1wvst au1h whr 201 50.0000 +33 2_qwertsdjh hr4s17 pdhrt2016808 xxxxxx::xxxxxx:track:7bo4rs4spmY83wwmj9gdwm h 21 waav vt1$sw,Ywh vt1aa 147 87.5000 +34 2_qwertsdjh r1049Y 9ttws7800038 xxxxxx::xxxxxx:track:17uissd0wrfeptenj1s05p r121w uswd ssua2w1 228 50.0000 +35 22_qwertsdjh r31377 swwd20818023 sdsd1212313::296042772 rssu vws rshs (ah sY 9a2Y) s, m 355 25.0000 +36 22_qwertsdjh h23wa5 pdhrs2083605 sdsd1212313::1542132188 havwwhw 1v 1uu vws wu1s ssv1vwahw 274 20.0000 +37 22_qwertsdjh w71312 wsrb91301601 sdsd1212313::647924566 w1ssst ar vws wa2w: aashwhw vwvus 9shm1swh r1uurwwrw 144 50.0000 +38 22_qwertsdjh h4894d sw1561385905 sdsd1212313::1260823556 hsbst 91rs 2arh (rs1v. 
r1ww1rh & tssw) t1w 2www1 263 66.6500 +39 22_qwertsdjh wb85rs wvr941100228 sdsd1212313::435903489 wau2sh rwwu2 1s1ds2 332 75.0000 +40 22_qwertsdjh s13s5d hur711812807 sdsd1212313::1441760307 s1Y9s hav m1h 9uaspbwwv 218 50.0000 +41 22_qwertsdjh ab99su swws20350142 sdsd1212313::4420113 a1t1haw2 w1srY sahsY 207 100.0000 +42 22_qwertsdjh rb72p4 hu9Y91000092 sdsd1212313::986079123 r1uust w2 hwr r1hrwsuuw 476 50.0000 +43 10_qwertsdjh r39265 swr911514890 qobuz::46805323 ratsbst (rs1v. w1hvwwau2 & 1wv) w uabs s1sahhsh 280 33.3300 +44 17_qwertsdjh 9979mr audiomack::469772 9arvwsw r twvvd Ysu1raur 2m a1su 262 25.0000 +45 30_qwertsdjh 9b9sw9 swsYw1103138 tidal::61275086 9sbstuY swuuw 1.rw1u 162 45.0000 +46 30_qwertsdjh rb9Ya7 swpY51238169 tidal::86056107 ratu2rw2s w1uuww rusppp 194 50.0000 +47 27_qwertsdjh a71591 ps6a42178444 pandora::126424921 aara aara ss sh1sats 2s vw uaw rtswdY 222 100.0000 +48 27_qwertsdjh 28050d sw15r1200394 pandora::23879758 21s1ws ts2 222 32.5000 +49 23_qwertsdjh s0704w wsap11300762 ajdhkhdqjh::907w3suY23 sh2a b1twasw 1tvwwvw 187 50.0000 +50 23_qwertsdjh as8aaY 2s4Y12009601 ajdhkhdqjh::90898s7s4a a1ast ru12ww1u 154 70.0000 +51 23_qwertsdjh w3718b 2sh920300077 ajdhkhdqjh::900856wwar wuw2s9 vws 2dww1h & s1swsh atrwswvt1 495 50.0000 +52 23_qwertsdjh 918pm7 vr19w1272094 ajdhkhdqjh::9008shs4as 9u1rs wwssa aaat s1hw aawwah 306 100.0000 +53 15_qwertsdjh p90bhY sw1012059277 sdsdsdsd::bwdsprjmr1Y psstw2a 1swwa a1sua 229 60.0000 +54 15_qwertsdjh r927vt swu4t2024809 sdsdsdsd::ju69v6svsz8 r1hb1w m1Y swhwst 210 10.0000 +55 15_qwertsdjh w4688h swws90654910 sdsdsdsd::cg19wvwyiac wsv 1r1Y rtas vws raaw s.a.v.a. 119 50.0000 +56 17_qwertsdjh 7s4w8Y audiomack::8016659 7.62 (r1wv) Yrh usrrw 149 75.0000 +57 45_qwertsdjh rb9sa1 ps6h21738621 melodyvr::vra.268417020 r1uuwhw wwddu1 236 50.0000 +58 45_qwertsdjh 295a2p swp2t2011404 melodyvr::vra.552184754 2wss vÚ adsh1 & 1hssu 11 178 4.0000 +59 26_qwertsdjh r86898 sw1561124650 trebel::18523836 rtsr uabs (2t1ss rs1v. 
vws rsssh2 vtw9svs) vws 2ts1s vs1s 208 0.6500 +60 13_qwertsdjh s5988Y w9ss71105464 deezer::668712592 swt1rus ratsst wsastws1bY 249 40.0000 +61 27_qwertsdjh r2372d rtd039501700 pandora::19789159 rÉrwus s1 rwuus ru1s2s hasw1ta 287 50.0000 +62 27_qwertsdjh wb2sb7 psrvs1400012 pandora::58993839 ww wv ratvw wv? (wshs1h ratu2) b1twasw 1tvwwvw|21sw1h s1tusY 277 75.0000 +63 23_qwertsdjh w3526w sw32r1710388 ajdhkhdqjh::9076r84Y6u Yas wwasu2 wa b1twasw 1tvwwvw 86 12.5000 +64 23_qwertsdjh vb998h sw8s61400001 ajdhkhdqjh::900ua0rwda 2:161s (rs1v. uYtwr mahsw) tY1h swuwats 241 100.0000 +65 15_qwertsdjh w18301 sw6690610001 sdsdsdsd::sw4ekh3hr1g whvta tars|m1wtars'h ata2srvwahw 102 33.3300 +66 15_qwertsdjh r17vua huw762000295 sdsdsdsd::b-kw9wd89ys r1Y vaa ssrw tb1ws 163 50.0000 +67 15_qwertsdjh vr770a sw6641900012 sdsdsdsd::spwyurwu9rg vwwhs w ws1t2 1 tssast rtwwvwh1 pssw121 247 50.0000 +68 2_qwertsdjh 997s72 r17rb1800007 xxxxxx::xxxxxx:track:20h6qn1pzkakwmnh6vwmb1 9w 9w 9ap bwwwvÈh 234 10.0000 +69 2_qwertsdjh sr1u11 swss72117873 xxxxxx::xxxxxx:track:5ctrY2rmphwswfrbxtrtuw ss wassvwssw r1tvst 1rs 260 75.0000 +70 2_qwertsdjh s19ap0 swss71923044 xxxxxx::xxxxxx:track:5des269wldorq3swa9cpcs s1wY - tsswp mw1Y ratvsd,adsh1 220 91.2500 +71 22_qwertsdjh 901407 pdwh92150458 sdsd1212313::1572040800 919Y hswv1 154 7.5000 +72 22_qwertsdjh rb9ars sw35s0410906 sdsd1212313::54533555 r1wwhw w1aaY wa uwrsY 250 75.0000 +73 22_qwertsdjh wb7uss 1vh261454510 sdsd1212313::877713041 wtssh-sYs2 1s9stw1h 21rh 268 42.5000 +74 22_qwertsdjh d8536h rar010024407 sdsd1212313::1531918169 u1 r1aahst1 1urts2a wsvwsttsd 291 100.0000 +75 22_qwertsdjh w09926 2s9u61062795 sdsd1212313::375063825 ww1ss Yast 9a2Y (vssvahws bwwwah swp) vsusrshsY 408 75.0000 +76 22_qwertsdjh t93u11 2saw82022224 sdsd1212313::1545333643 tssss9st Yas wst1wh1 vsuuw 268 60.0000 +77 10_qwertsdjh s03121 swwt10029006 qobuz::53680325 sh9ts1s19us m1ssw hsrvah war1t2 203 41.0000 +78 2_qwertsdjh r139sv hu2911801894 
xxxxxx::xxxxxx:track:4175seztwgsxqadYxgf99z 21hdst - 140 9as ratsasv bstwwah 2m ratsasv whwvtsrvat 203 50.0000 +79 2_qwertsdjh 9b1ssY xxxxxx::xxxxxx:track:22i0v22w6Yshhcar5dw5pm 9sth wv 1uu 1r1Y wrauuar 216 30.0000 +80 2_qwertsdjh p07221 swsu19900510 xxxxxx::xxxxxx:track:78ppwnubrvuy35b65vrlz0 pss uarst1 sh1sat1tss 2s vw s22ws w1hvw1wa 309 100.0000 +81 2_qwertsdjh a02689 rw6540863851 xxxxxx::xxxxxx:track:6lb15ss0urwnywbrum7dz0 au2 rs1shsww (atwwwh1uuY astratss2 9Y v1hY1 vsrsst) - s1t1ass bstwwah wwv vshsw s1t1ass 213 12.5000 +82 22_qwertsdjh t14119 2s9u60409626 sdsd1212313::777322913 twswhwau2 wt1bs 2wwwst 304 55.2900 +83 22_qwertsdjh s24342 2sh061301383 sdsd1212313::653518482 s1tw1 (w uwss wv uas2) [tss1wvsts2] wraavst 267 25.0000 +84 22_qwertsdjh w1259w swsw12100515 sdsd1212313::1586741468 wt1rs ssuuY atwrs 208 50.0000 +85 22_qwertsdjh w2172Y 1t1820800190 sdsd1212313::418470246 vws w1t2sh wrssv uwvvus 91h2 282 24.7400 +86 22_qwertsdjh d19162 sws4r0725625 sdsd1212313::307189094 wvatY ar uabs (s1t1ass bstwwah) wvwhwt1Y sswwr 148 50.0000 +87 22_qwertsdjh s0710p au1460400403 sdsd1212313::1465920034 sswswd at1r2Ę sÓrwĆ saww, ms2sst, m1Źr1 248 33.3200 +88 10_qwertsdjh rb9bvw sw58u1225309 qobuz::48467053 vws r1t wh 1a1tvsshv 1812 wwubst msrw 120 33.3300 +89 50_qwertsdjh w9565d 1vss70900628 gtl::87445503 wrwÖhs sasshvs 2ws rÖtvwstwsst 182 37.5000 +90 30_qwertsdjh wb95bw sw4s41631207 tidal::61932110 wattY hav wattY wsswhw wYh2tass 214 2.0000 +91 30_qwertsdjh a5502s 9twss9700316 tidal::5118600 a 21 bw21 (va21 u1 bw21) 2aswha 239 37.5000 +92 30_qwertsdjh r39265 sw4t31425489 tidal::39491477 ratsbst vtaaa & 1u s1r 210 33.3300 +93 27_qwertsdjh s16wuh str712000004 pandora::94806377 sabwsw 2sbwv1 116 40.0000 +94 27_qwertsdjh 905757 w9ssm0800307 pandora::245005 9svvst vwwhww s1twhs aaur1tv 234 15.0000 +95 23_qwertsdjh w1243s w9aw81529126 ajdhkhdqjh::9015aw2rps wau2whw ah (124 9as) ratsasv m1sw 316 25.0000 +96 15_qwertsdjh a6451w psbtt1230072 sdsdsdsd::h2rj6h-c9wp a1sws t & 9 
rw1tvwv1tw 181 26.6600 +97 15_qwertsdjh r2337v pdsw51954661 sdsdsdsd::o3Ysi0219is rw1v'w Yast h1ss (rs1v. ruas2$) rwssvt1wuw 173 10.0000 +98 15_qwertsdjh vs7wss sw2p60900920 sdsdsdsd::9r9v9sslp2o rtssr1Y swuust bwaushv m 317 50.0000 +99 11_qwertsdjh 26809w haswp2018120 idagio::39770836haswp2018120 2swuww st 2sh wwsssu 9uÅ whwst-uwws suwts2, s1twv vØh2su 9a2w9stw rsY2s, sbwh2suwws wvs2shvstw w1hwratshwhw 326 50.0000 +100 5_qwertsdjh ub91ss w9bsv0700100 soundcloud::736209364 ushrw 9ap (uwbs) s1twuYh s1hwah 598 75.0000 +101 45_qwertsdjh 915446 9sr099800304 melodyvr::vra.26626553 9wvwus w19w9 sawvÉ 356 50.0000 +102 26_qwertsdjh r52150 ps4vr2145859 trebel::117007958 rwaus uavv1 ss vt1rY u1rtshrs 247 33.3300 +103 30_qwertsdjh r30478 w99ws8800408 tidal::110958828 rwstwr m1uw ssw1 m1r1t1 514 25.0000 +104 27_qwertsdjh d8568v vr1r11439569 pandora::64481461 9sh2wrwah s1sw swwssuwva 168 50.0000 diff --git a/vector/element.go b/vector/element.go new file mode 100644 index 0000000..9d1e40f --- /dev/null +++ b/vector/element.go @@ -0,0 +1,16 @@ +package vector + +type element struct { + line string + i int +} + +func (e *element) Less(other Element) bool { + return e.i < other.(*element).i +} + +func (e *element) Value() string { + return e.line +} + +func (e *element) String() string { return e.line } diff --git a/vector/int_vector.go b/vector/int_vector.go index 0f39150..f54b45f 100644 --- a/vector/int_vector.go +++ b/vector/int_vector.go @@ -12,15 +12,27 @@ var _ Vector = &IntVec{} func AllocateIntVector(size int) Vector { return &IntVec{ - s: make([]int, 0, size), + s: make([]Element, 0, size), } } type IntVec struct { - s []int + s []Element } -func (v *IntVec) Get(i int) interface{} { +func (*IntVec) newElement(value string) *element { + i, err := strconv.Atoi(value) + if err != nil { + panic(errors.Wrap(err, "converting value from string")) + } + + return &element{ + line: value, + i: i, + } +} + +func (v *IntVec) Get(i int) Element { return v.s[i] } @@ -28,44 +40,33 @@ 
func (v *IntVec) End() int { return len(v.s) } -func (v *IntVec) insert(i int, value interface{}) error { - v.s = append(v.s[:i], append([]int{value.(int)}, v.s[i:]...)...) +func (v *IntVec) insert(i int, value string) error { + v.s = append(v.s[:i], append([]Element{v.newElement(value)}, v.s[i:]...)...) return nil } -func (v *IntVec) PushBack(value interface{}) error { - num, err := strconv.Atoi(value.(string)) - if err != nil { - return err - } - v.s = append(v.s, num) +func (v *IntVec) PushBack(value string) error { + v.s = append(v.s, v.newElement(value)) return nil } -func (v *IntVec) Compare(v1, v2 interface{}) bool { - return v1.(int) >= v2.(int) -} -func (v *IntVec) Less(v1, v2 interface{}) bool { - return v1.(int) < v2.(int) +func (v *IntVec) Less(v1, v2 Element) bool { + return v1.Less(v2) } -func (v *IntVec) convertFromString(value string) (interface{}, error) { - num2, err := strconv.Atoi(value) - if err != nil { - return false, err - } - return num2, err +func (v *IntVec) convertFromString(value string) (Element, error) { + return v.newElement(value), nil } func (v *IntVec) Dump(filename string) error { - file, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, 0644) + file, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, 0o644) if err != nil { return errors.Errorf("failed creating file: %s", err) } datawriter := bufio.NewWriter(file) for _, data := range v.s { - _, err = datawriter.WriteString(strconv.Itoa(data) + "\n") + _, err = datawriter.WriteString(data.Value() + "\n") if err != nil { return errors.Errorf("failed writing file: %s", err) } diff --git a/vector/string_vector.go b/vector/string_vector.go index e091eb9..93b7f4c 100644 --- a/vector/string_vector.go +++ b/vector/string_vector.go @@ -11,15 +11,21 @@ var _ Vector = &StringVec{} func AllocateStringVector(size int) Vector { return &StringVec{ - s: make([]string, 0, size), + s: make([]Element, 0, size), } } type StringVec struct { - s []string + s []Element } -func (v *StringVec) 
Get(i int) interface{} { +func (*StringVec) newElement(value string) *element { + return &element{ + line: value, + } +} + +func (v *StringVec) Get(i int) Element { return v.s[i] } @@ -27,36 +33,33 @@ func (v *StringVec) End() int { return len(v.s) } -func (v *StringVec) insert(i int, value interface{}) error { - v.s = append(v.s[:i], append([]string{value.(string)}, v.s[i:]...)...) +func (v *StringVec) insert(i int, value string) error { + v.s = append(v.s[:i], append([]Element{v.newElement(value)}, v.s[i:]...)...) return nil } -func (v *StringVec) PushBack(value interface{}) error { - v.s = append(v.s, value.(string)) +func (v *StringVec) PushBack(value string) error { + v.s = append(v.s, v.newElement(value)) return nil } -func (v *StringVec) Compare(v1, v2 interface{}) bool { - return v1.(string) >= v2.(string) -} -func (v *StringVec) Less(v1, v2 interface{}) bool { - return v1.(string) < v2.(string) +func (v *StringVec) Less(v1, v2 Element) bool { + return v1.Less(v2) } -func (v *StringVec) convertFromString(value string) (interface{}, error) { - return value, nil +func (v *StringVec) convertFromString(value string) (Element, error) { + return v.newElement(value), nil } func (v *StringVec) Dump(filename string) error { - file, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, 0644) + file, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, 0o644) if err != nil { return errors.Errorf("failed creating file: %s", err) } datawriter := bufio.NewWriter(file) for _, data := range v.s { - _, err = datawriter.WriteString(data + "\n") + _, err = datawriter.WriteString(data.Value() + "\n") if err != nil { return errors.Errorf("failed writing file: %s", err) } diff --git a/vector/table_vector.go b/vector/table_vector.go new file mode 100644 index 0000000..1ba4fcd --- /dev/null +++ b/vector/table_vector.go @@ -0,0 +1,89 @@ +package vector + +import ( + "bufio" + "os" + "strconv" + "strings" + + "github.com/pkg/errors" +) + +var _ Vector = &TableVec{} + +func 
AllocateTableVector(sep string, pos int) func(int) Vector { + return func(size int) Vector { + return &TableVec{ + s: make([]Element, 0, size), + sep: sep, + pos: pos, + } + } +} + +type TableVec struct { + sep string + s []Element + pos int +} + +func (v *TableVec) newElement(value string) *element { + num := strings.Split(value, v.sep)[v.pos] + i, err := strconv.Atoi(num) + if err != nil { + panic(errors.Wrap(err, "converting value from string")) + } + + return &element{ + line: value, + i: i, + } +} + +func (v *TableVec) Get(i int) Element { + return v.s[i] +} + +func (v *TableVec) End() int { + return len(v.s) +} + +func (v *TableVec) insert(i int, value string) error { + v.s = append(v.s[:i], append([]Element{v.newElement(value)}, v.s[i:]...)...) + return nil +} + +func (v *TableVec) PushBack(value string) error { + v.s = append(v.s, v.newElement(value)) + return nil +} + +func (v *TableVec) Less(v1, v2 Element) bool { + return v1.Less(v2) +} + +func (v *TableVec) convertFromString(value string) (Element, error) { + return v.newElement(value), nil +} + +func (v *TableVec) Dump(filename string) error { + file, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, 0o644) + if err != nil { + return errors.Errorf("failed creating file: %s", err) + } + datawriter := bufio.NewWriter(file) + + for _, data := range v.s { + _, err = datawriter.WriteString(data.Value() + "\n") + if err != nil { + return errors.Errorf("failed writing file: %s", err) + } + } + datawriter.Flush() + file.Close() + return nil +} + +func (v *TableVec) FrontShift() { + v.s = v.s[1:] +} diff --git a/vector/vector.go b/vector/vector.go index 972e023..7afd3bb 100644 --- a/vector/vector.go +++ b/vector/vector.go @@ -1,14 +1,23 @@ package vector -import "sort" +import ( + "fmt" + "sort" + "strings" +) + +type Element interface { + Value() string + Less(other Element) bool +} type Vector interface { // Get Access i-th element - Get(i int) interface{} + Get(i int) Element // PushBack Add item at the 
end - PushBack(value interface{}) error + PushBack(value string) error // Less Returns wether v1 is smaller than v2 - Less(v1, v2 interface{}) bool + Less(v1, v2 Element) bool // Dump Create a file and store the underluing data Dump(filename string) error // FrontShift Remove the first element @@ -16,19 +25,23 @@ type Vector interface { // End Length of the Vector End() int // insert Insert elements at index i - insert(i int, value interface{}) error + insert(i int, value string) error // convertFromString Convert the line from the file to the expected underlying data - convertFromString(value string) (interface{}, error) + convertFromString(value string) (Element, error) } // Sort Perform a binary search to find where to put a value in a vector. Ascending order. -func Sort(ans Vector, num string) error { - val, err := ans.convertFromString(num) +func Sort(ans Vector, line, sep string, pos int) error { + num := strings.Split(line, sep) + if len(num) < pos { + return fmt.Errorf("could not find position %d in %q", pos, line) + } + val, err := ans.convertFromString(num[pos]) if err != nil { return err } - pos := sort.Search(ans.End(), func(i int) bool { + found := sort.Search(ans.End(), func(i int) bool { return !ans.Less(ans.Get(i), val) }) - return ans.insert(pos, val) + return ans.insert(found, line) } From ef490c7be31ea9f24da75b5491cb17c19f83cc87 Mon Sep 17 00:00:00 2001 From: Arsham Shirvani Date: Fri, 4 Feb 2022 16:40:45 +0000 Subject: [PATCH 2/3] ref(vector): focus the vector logic around a place --- file/chunk.go | 4 +- file/file.go | 6 +-- file/sort.go | 4 +- main_bench_test.go | 3 +- vector/element.go | 10 +++-- vector/int_vector.go | 85 +++++++---------------------------- vector/string_vector.go | 78 ++++---------------------------- vector/table_vector.go | 99 +++++++++-------------------------------- vector/vector.go | 82 ++++++++++++++++++++++------------ 9 files changed, 114 insertions(+), 257 deletions(-) diff --git a/file/chunk.go b/file/chunk.go index 
89cdce0..f7055a7 100644 --- a/file/chunk.go +++ b/file/chunk.go @@ -12,7 +12,7 @@ import ( type chunkInfo struct { file *os.File scanner *bufio.Scanner - buffer vector.Vector + buffer *vector.Vector filename string } @@ -36,7 +36,7 @@ type chunks struct { } // new Create a new chunk and initialize it. -func (c *chunks) new(chunkPath string, allocate func(size int) vector.Vector, size int) error { +func (c *chunks) new(chunkPath string, allocate func(size int) *vector.Vector, size int) error { f, err := os.Open(chunkPath) if err != nil { return err diff --git a/file/file.go b/file/file.go index 945e6f4..2ebdabf 100644 --- a/file/file.go +++ b/file/file.go @@ -14,7 +14,7 @@ import ( type Info struct { Reader io.Reader - Allocate func(int) vector.Vector + Allocate func(int) *vector.Vector } // Sort Perform a naive sort of a reader and put the results in ascending order in a Vector. @@ -50,7 +50,7 @@ func (f *Info) CreateSortedChunks(chunkFolder string, dumpSize int) ([]string, e chunkIdx := 0 chunkPaths := []string{} scanner := bufio.NewScanner(f.Reader) - var ans vector.Vector + var ans *vector.Vector for scanner.Scan() { if row%dumpSize == 0 { if row != 0 { @@ -85,7 +85,7 @@ func (f *Info) CreateSortedChunks(chunkFolder string, dumpSize int) ([]string, e return chunkPaths, nil } -func dumpChunk(ans vector.Vector, folder string, chunkIdx int) (string, error) { +func dumpChunk(ans *vector.Vector, folder string, chunkIdx int) (string, error) { fn := "dump chunk" chunkPath := path.Join(folder, "chunk_"+strconv.Itoa(chunkIdx)+".tsv") err := ans.Dump(chunkPath) diff --git a/file/sort.go b/file/sort.go index 26243e0..1a3dd58 100644 --- a/file/sort.go +++ b/file/sort.go @@ -54,13 +54,13 @@ func (f *Info) MergeSort(chunkPaths []string, k int) (output []vector.Element, e output = append(output, minValue) // remove the first element from the chunk we pulled the smallest value minChunk.buffer.FrontShift() - if minChunk.buffer.End() == 0 { + if minChunk.buffer.Len() == 0 { err = 
minChunk.pullSubset(k) if err != nil { return nil, err } // if after pulling data the chunk buffer is still empty then we can remove it - if minChunk.buffer.End() == 0 { + if minChunk.buffer.Len() == 0 { toShrink = append(toShrink, minIdx) err = chunks.shrink(toShrink) if err != nil { diff --git a/main_bench_test.go b/main_bench_test.go index 0eb6bdb..bc69053 100644 --- a/main_bench_test.go +++ b/main_bench_test.go @@ -1,7 +1,6 @@ package main_test import ( - "io/ioutil" "os" "path" "testing" @@ -31,7 +30,7 @@ func BenchmarkMergeSort(b *testing.B) { _ = err } f.Close() - dir, err := ioutil.ReadDir("testdata/chunks") + dir, err := os.ReadDir("testdata/chunks") assert.NoError(b, err) for _, d := range dir { err = os.RemoveAll(path.Join("testdata/chunks", d.Name())) diff --git a/vector/element.go b/vector/element.go index 9d1e40f..260a2ae 100644 --- a/vector/element.go +++ b/vector/element.go @@ -1,14 +1,16 @@ package vector +// An Element should return the final value that should be retuned when the +// final row is back to the caller. +type Element interface { + Value() string +} + type element struct { line string i int } -func (e *element) Less(other Element) bool { - return e.i < other.(*element).i -} - func (e *element) Value() string { return e.line } diff --git a/vector/int_vector.go b/vector/int_vector.go index f54b45f..56ed872 100644 --- a/vector/int_vector.go +++ b/vector/int_vector.go @@ -1,81 +1,26 @@ package vector import ( - "bufio" - "os" "strconv" "github.com/pkg/errors" ) -var _ Vector = &IntVec{} - -func AllocateIntVector(size int) Vector { - return &IntVec{ +func AllocateIntVector(size int) *Vector { + return &Vector{ s: make([]Element, 0, size), + NewElement: func(value string) Element { + i, err := strconv.Atoi(value) + if err != nil { + panic(errors.Wrap(err, "converting value from string")) + } + + return &element{ + line: value, + i: i, + } + }, + // nolint:forcetypeassert // we already know the type. 
+ Less: func(v1, v2 Element) bool { return v1.(*element).i < v2.(*element).i }, } } - -type IntVec struct { - s []Element -} - -func (*IntVec) newElement(value string) *element { - i, err := strconv.Atoi(value) - if err != nil { - panic(errors.Wrap(err, "converting value from string")) - } - - return &element{ - line: value, - i: i, - } -} - -func (v *IntVec) Get(i int) Element { - return v.s[i] -} - -func (v *IntVec) End() int { - return len(v.s) -} - -func (v *IntVec) insert(i int, value string) error { - v.s = append(v.s[:i], append([]Element{v.newElement(value)}, v.s[i:]...)...) - return nil -} - -func (v *IntVec) PushBack(value string) error { - v.s = append(v.s, v.newElement(value)) - return nil -} - -func (v *IntVec) Less(v1, v2 Element) bool { - return v1.Less(v2) -} - -func (v *IntVec) convertFromString(value string) (Element, error) { - return v.newElement(value), nil -} - -func (v *IntVec) Dump(filename string) error { - file, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, 0o644) - if err != nil { - return errors.Errorf("failed creating file: %s", err) - } - datawriter := bufio.NewWriter(file) - - for _, data := range v.s { - _, err = datawriter.WriteString(data.Value() + "\n") - if err != nil { - return errors.Errorf("failed writing file: %s", err) - } - } - datawriter.Flush() - file.Close() - return nil -} - -func (v *IntVec) FrontShift() { - v.s = v.s[1:] -} diff --git a/vector/string_vector.go b/vector/string_vector.go index 93b7f4c..dd1772d 100644 --- a/vector/string_vector.go +++ b/vector/string_vector.go @@ -1,74 +1,14 @@ package vector -import ( - "bufio" - "os" - - "github.com/pkg/errors" -) - -var _ Vector = &StringVec{} - -func AllocateStringVector(size int) Vector { - return &StringVec{ +// AllocateStringVector returns a Vector that can sort based on strings. 
+func AllocateStringVector(size int) *Vector { + return &Vector{ s: make([]Element, 0, size), + NewElement: func(value string) Element { + return &element{ + line: value, + } + }, + Less: func(v1, v2 Element) bool { return v1.Value() < v2.Value() }, } } - -type StringVec struct { - s []Element -} - -func (*StringVec) newElement(value string) *element { - return &element{ - line: value, - } -} - -func (v *StringVec) Get(i int) Element { - return v.s[i] -} - -func (v *StringVec) End() int { - return len(v.s) -} - -func (v *StringVec) insert(i int, value string) error { - v.s = append(v.s[:i], append([]Element{v.newElement(value)}, v.s[i:]...)...) - return nil -} - -func (v *StringVec) PushBack(value string) error { - v.s = append(v.s, v.newElement(value)) - return nil -} - -func (v *StringVec) Less(v1, v2 Element) bool { - return v1.Less(v2) -} - -func (v *StringVec) convertFromString(value string) (Element, error) { - return v.newElement(value), nil -} - -func (v *StringVec) Dump(filename string) error { - file, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, 0o644) - if err != nil { - return errors.Errorf("failed creating file: %s", err) - } - datawriter := bufio.NewWriter(file) - - for _, data := range v.s { - _, err = datawriter.WriteString(data.Value() + "\n") - if err != nil { - return errors.Errorf("failed writing file: %s", err) - } - } - datawriter.Flush() - file.Close() - return nil -} - -func (v *StringVec) FrontShift() { - v.s = v.s[1:] -} diff --git a/vector/table_vector.go b/vector/table_vector.go index 1ba4fcd..4246c22 100644 --- a/vector/table_vector.go +++ b/vector/table_vector.go @@ -1,89 +1,34 @@ package vector import ( - "bufio" - "os" "strconv" "strings" "github.com/pkg/errors" ) -var _ Vector = &TableVec{} - -func AllocateTableVector(sep string, pos int) func(int) Vector { - return func(size int) Vector { - return &TableVec{ - s: make([]Element, 0, size), - sep: sep, - pos: pos, - } - } -} - -type TableVec struct { - sep string - s 
[]Element - pos int -} - -func (v *TableVec) newElement(value string) *element { - num := strings.Split(value, v.sep)[v.pos] - i, err := strconv.Atoi(num) - if err != nil { - panic(errors.Wrap(err, "converting value from string")) - } - - return &element{ - line: value, - i: i, - } -} - -func (v *TableVec) Get(i int) Element { - return v.s[i] -} - -func (v *TableVec) End() int { - return len(v.s) -} - -func (v *TableVec) insert(i int, value string) error { - v.s = append(v.s[:i], append([]Element{v.newElement(value)}, v.s[i:]...)...) - return nil -} - -func (v *TableVec) PushBack(value string) error { - v.s = append(v.s, v.newElement(value)) - return nil -} - -func (v *TableVec) Less(v1, v2 Element) bool { - return v1.Less(v2) -} - -func (v *TableVec) convertFromString(value string) (Element, error) { - return v.newElement(value), nil -} - -func (v *TableVec) Dump(filename string) error { - file, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, 0o644) - if err != nil { - return errors.Errorf("failed creating file: %s", err) - } - datawriter := bufio.NewWriter(file) - - for _, data := range v.s { - _, err = datawriter.WriteString(data.Value() + "\n") - if err != nil { - return errors.Errorf("failed writing file: %s", err) +// AllocateTableVector returns an allocation function that sorts the file on +// the pos element with the sep separator. +func AllocateTableVector(sep string, pos int) func(int) *Vector { + return func(size int) *Vector { + return &Vector{ + s: make([]Element, 0, size), + NewElement: func(value string) Element { + num := strings.Split(value, sep)[pos] + i, err := strconv.Atoi(num) + if err != nil { + panic(errors.Wrap(err, "converting value from string")) + } + + return &element{ + line: value, + i: i, + } + }, + Less: func(v1, v2 Element) bool { + // nolint:forcetypeassert // we already know the type. 
+ return v1.(*element).i < v2.(*element).i + }, } } - datawriter.Flush() - file.Close() - return nil -} - -func (v *TableVec) FrontShift() { - v.s = v.s[1:] } diff --git a/vector/vector.go b/vector/vector.go index 7afd3bb..f25165f 100644 --- a/vector/vector.go +++ b/vector/vector.go @@ -1,47 +1,73 @@ +// Package vector contains the core operation for sorting. package vector import ( + "bufio" "fmt" + "os" "sort" "strings" + + "github.com/pkg/errors" ) -type Element interface { - Value() string - Less(other Element) bool +// Vector holds a slice of Elements for sorting. The NewElement is called each +// time a new item from the file is read or inserted into the slice. The Less +// function should return true if the first element is lower than the second. +type Vector struct { + NewElement func(value string) Element + Less func(v1, v2 Element) bool + s []Element +} + +// Get returns the element at the given index. +func (v *Vector) Get(i int) Element { + return v.s[i] +} + +// Len returns the length of the vector. +func (v *Vector) Len() int { + return len(v.s) } -type Vector interface { - // Get Access i-th element - Get(i int) Element - // PushBack Add item at the end - PushBack(value string) error - // Less Returns wether v1 is smaller than v2 - Less(v1, v2 Element) bool - // Dump Create a file and store the underluing data - Dump(filename string) error - // FrontShift Remove the first element - FrontShift() - // End Length of the Vector - End() int - // insert Insert elements at index i - insert(i int, value string) error - // convertFromString Convert the line from the file to the expected underlying data - convertFromString(value string) (Element, error) +// PushBack pushes a new element to the end of the vector. 
+func (v *Vector) PushBack(value string) { + v.s = append(v.s, v.NewElement(value)) +} + +func (v *Vector) Dump(filename string) error { + file, err := os.OpenFile(filename, os.O_CREATE|os.O_WRONLY, 0o644) + if err != nil { + return errors.Errorf("failed creating file: %s", err) + } + defer file.Close() + datawriter := bufio.NewWriter(file) + defer datawriter.Flush() + + for _, data := range v.s { + _, err = datawriter.WriteString(data.Value() + "\n") + if err != nil { + return errors.Errorf("failed writing file: %s", err) + } + } + return nil +} + +// FrontShift shifts the vector one element forward. +func (v *Vector) FrontShift() { + v.s = v.s[1:] } // Sort Perform a binary search to find where to put a value in a vector. Ascending order. -func Sort(ans Vector, line, sep string, pos int) error { +func Sort(v *Vector, line, sep string, pos int) error { num := strings.Split(line, sep) if len(num) < pos { return fmt.Errorf("could not find position %d in %q", pos, line) } - val, err := ans.convertFromString(num[pos]) - if err != nil { - return err - } - found := sort.Search(ans.End(), func(i int) bool { - return !ans.Less(ans.Get(i), val) + val := v.NewElement(num[pos]) + found := sort.Search(v.Len(), func(i int) bool { + return !v.Less(v.Get(i), val) }) - return ans.insert(found, line) + v.s = append(v.s[:found], append([]Element{v.NewElement(line)}, v.s[found:]...)...) 
+ return nil } From f03eadd33efd450f2ef0274913d20f6e1cf63c0c Mon Sep 17 00:00:00 2001 From: Arsham Shirvani Date: Fri, 4 Feb 2022 21:01:19 +0000 Subject: [PATCH 3/3] ref(file): Info accepts separator and position --- file/file.go | 12 ++++++------ main.go | 5 +++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/file/file.go b/file/file.go index 2ebdabf..36d06a0 100644 --- a/file/file.go +++ b/file/file.go @@ -2,19 +2,19 @@ package file import ( "bufio" - "io" "path" "strconv" "github.com/askiada/external-sort/vector" - "github.com/pkg/errors" ) type Info struct { - Reader io.Reader - Allocate func(int) *vector.Vector + Separator string + Pos int + Reader io.Reader + Allocate func(int) *vector.Vector } // Sort Perform a naive sort of a reader and put the results in ascending order in a Vector. @@ -23,7 +23,7 @@ func (f *Info) Sort(file io.Reader) error { scanner := bufio.NewScanner(file) for scanner.Scan() { text := scanner.Text() - err := vector.Sort(ans, text, "\t", 0) + err := vector.Sort(ans, text, f.Separator, f.Pos) if err != nil { return errors.Wrap(err, "sorting file") } @@ -64,7 +64,7 @@ func (f *Info) CreateSortedChunks(chunkFolder string, dumpSize int) ([]string, e ans = f.Allocate(dumpSize) } text := scanner.Text() - err := vector.Sort(ans, text, "\t", 0) + err := vector.Sort(ans, text, f.Separator, f.Pos) if err != nil { return nil, errors.Wrap(err, fn) } diff --git a/main.go b/main.go index 4cc5d3b..d2d4402 100644 --- a/main.go +++ b/main.go @@ -30,8 +30,9 @@ func main() { } defer f.Close() fI := &file.Info{ - Reader: f, - Allocate: vector.AllocateTableVector("\t", 0), + Reader: f, + Allocate: vector.AllocateTableVector("\t", 0), + Separator: "\t", } // create small files with maximum 30 rows in each