import collections
import math
import os
import random
import zipfile

import numpy as np
import urllib
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
def read_data(filename):
    """Read the first file inside the zip archive and split it into a list of words."""
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data
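read_data expects text8.zip to already sit in the working directory. If it does not, a small helper along the following lines can fetch it first. This is a sketch of my own rather than part of the listing; the URL is the commonly used text8 mirror and should be treated as an assumption.

import os
import urllib.request

TEXT8_URL = 'http://mattmahoney.net/dc/text8.zip'  # assumed mirror, roughly 31 MB

def maybe_download(filename='text8.zip'):
    """Download the corpus once; reuse the local copy on later runs."""
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(TEXT8_URL, filename)
    return filename

maybe_download('text8.zip')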
words = read_data('text8.zip')
vocabulary_size = 50000
def build_dataset(words):
    """Put 'UNK' at position 0 of the vocabulary to stand for unknown words,
    i.e. every word that does not make it into the dictionary is mapped to UNK."""
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary
data, count, dictionary, reverse_dictionary = build_dataset(words)
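A quick sanity check of what build_dataset produced can be run before training; this snippet is only illustrative and uses the variables defined above:

print('Most common words (+UNK):', count[:5])
print('Sample indices:', data[:10])
print('Sample words:  ', [reverse_dictionary[i] for i in data[:10]])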
data_index = 0
def generate_batch(batch_size, num_skips, skip_window):
    """Generate a skip-gram training batch of (center word, context word) pairs."""
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window context | center | skip_window context ]
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window          # position of the center word in the buffer
        targets_to_avoid = [skip_window]
        source_word = buffer[skip_window]
        for j in range(num_skips):
            # Pick a random context position that has not been used yet.
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = source_word
            labels[i * num_skips + j, 0] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels
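To see the (center word -> context word) pairs that generate_batch emits, a small trial batch can be printed; again just a sketch built on the definitions above:

batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]],
          '->', labels[i, 0], reverse_dictionary[labels[i, 0]])
data_index = 0  # reset the cursor so training starts from the beginning of the corpus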
batch_size = 128
embedding_size = 128    # dimensionality of the word vectors
skip_window = 1         # how many words to consider to the left and right
num_skips = 2           # how many (input, label) pairs to generate per center word
valid_size = 16         # random set of words used to report nearest neighbours
valid_window = 100      # draw the validation words from the 100 most frequent words
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64        # number of negative examples for NCE
graph = tf.Graph()
with graph.as_default():
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    with tf.device('/cpu:0'):
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # NCE loss: learn to tell the true context word apart from num_sampled random negatives.
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                       biases=nce_biases,
                       labels=train_labels,
                       inputs=embed,
                       num_sampled=num_sampled,
                       num_classes=vocabulary_size))
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

    # L2-normalize the embeddings so the matmul below yields cosine similarity.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

    init = tf.global_variables_initializer()
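Because the rows of normalized_embeddings have unit length, the matmul behind similarity is exactly cosine similarity between each validation word and every word in the vocabulary. A standalone NumPy sketch of the same computation, with toy numbers of my own:

import numpy as np

rng = np.random.RandomState(0)
emb = rng.uniform(-1.0, 1.0, size=(5, 4))               # toy embeddings: 5 words, 4 dims
normalized = emb / np.sqrt(np.sum(emb ** 2, axis=1, keepdims=True))
sim = normalized[[0, 2]].dot(normalized.T)               # rows 0 and 2 vs. the whole vocabulary
print(sim)                                               # each row peaks at 1.0 on the word itself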
num_steps = 100001

with tf.Session(graph=graph) as session:
    init.run()
    print("Initialized")

    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(
            batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000  # average loss over the last 2000 batches
            print("Average loss at step", step, ": ", average_loss)
            average_loss = 0

        if step % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8
                # Position 0 of the sorted similarities is the word itself, so skip it.
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = "Nearest to %s:" % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = "%s %s," % (log_str, close_word)
                print(log_str)

    final_embeddings = normalized_embeddings.eval()
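The imports at the top pull in matplotlib and scikit-learn's TSNE, presumably to project final_embeddings into two dimensions for inspection. A minimal sketch of such a plot, where the number of words plotted and the output filename are my own choices:

def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    plt.figure(figsize=(18, 18))
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), xytext=(5, 2),
                     textcoords='offset points', ha='right', va='bottom')
    plt.savefig(filename)

tsne = TSNE(n_components=2, perplexity=30, init='pca', n_iter=5000)
plot_only = 100  # plot only the most frequent words
low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
plot_with_labels(low_dim_embs, [reverse_dictionary[i] for i in range(plot_only)])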