-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdraft.py
More file actions
51 lines (48 loc) · 1.44 KB
/
Copy pathdraft.py
File metadata and controls
51 lines (48 loc) · 1.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#coding=utf-8
import collections
import numpy as np
import tensorflow as tf
import codecs
import os
# Data loading and preprocessing.
poetry_file = 'data/draft.txt'

# A line is discarded when its body contains any of these characters.
# The original condition tested '(' twice, so one test was dead; the second
# is assumed to have been meant for the fullwidth '(' that appears in
# Chinese text.  NOTE(review): confirm against the corpus.
_FORBIDDEN_CHARS = ('_', '(', '(', '《', '[')

# Collected poems; each kept body is stored wrapped in '[' and ']'.
poetrys = []
with codecs.open(poetry_file, "r", 'utf-8') as f:
    for line in f:
        # Each record is expected to be "title:content".  The unpacking is
        # the only statement here that can fail, so it is the only one
        # guarded, and only for the error it actually raises.
        try:
            title, content = line.strip().split(':')
        except ValueError as e:
            # The line had no ':' or more than one: report it and skip it.
            print(e)
            continue
        content = content.replace(' ', '')
        if any(ch in content for ch in _FORBIDDEN_CHARS):
            continue
        # Keep only bodies that are 5 to 79 characters long.
        if len(content) < 5 or len(content) > 79:
            continue
        poetrys.append('[' + content + ']')
# Sort the poems by length, shortest first.
poetrys = sorted(poetrys, key=len)
print("peotrys:", poetrys)
print(u'唐诗总数: ', len(poetrys))  # original author's note: a bit over 30,000 poems

# Count how many times each character occurs across all poems.
all_words = []
for poetry in poetrys:
    # A string iterates character by character, so extend() adds each one.
    all_words.extend(poetry)
print("all_words:", all_words)
counter = collections.Counter(all_words)
print("counter:", counter)

# (character, count) pairs ordered by descending count; characters with
# equal counts keep the order in which they were first seen.
count_pairs = counter.most_common()
print("counter_pairs:", count_pairs)

# Characters only, in the same frequency order.  Built element by element
# rather than by unpacking zip(*count_pairs), which raises ValueError when
# no poem survived filtering; an empty corpus now yields an empty tuple.
words = tuple(word for word, _ in count_pairs)
print("words:", words)
# Append a blank character as the last vocabulary entry.
words += (" ",)

# Map every character to an integer ID equal to its position in `words`.
word2idmap = {word: idx for idx, word in enumerate(words)}
print("word2idmap:", word2idmap)


# Convert each poem into a list of character IDs.
def word2idfunc(word):
    """Return the ID of `word`, or len(words) if it is not in word2idmap."""
    return word2idmap.get(word, len(words))


peorty_vecs = [[word2idfunc(ch) for ch in poem] for poem in poetrys]
print(peorty_vecs)