• I had planned to study NLP in the spring/summer of 2020, but the pandemic got in the way and my textbooks are all at school, so for now I can only learn a bit online.
  • I spent all of 2019 on CV and am now switching to NLP; NLP feels like it will be harder (feels like I've opened a big can of worms).
  • Updated on an ongoing basis.

1. The NLP Pipeline

  • Corpus acquisition (getting a dataset)
  • Corpus preprocessing (data cleaning): cleaning, word segmentation, part-of-speech tagging, stopword removal (content-free function words such as 的, 了, 么)
  • Feature engineering: turn words into vectors so the computer can process them (see the sketch after this list)
    • One approach is word embedding, which maps tokens to dense numeric vectors
  • Feature selection: pick the subset of features most informative for the task, discarding redundant or noisy ones
  • Model training
    • Choose a suitable machine learning model
  • Evaluation metrics
  • Typical NLP tasks
    • Sequence labeling
    • Classification
    • Sentence-pair relation judgment
    • Generation
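To make the preprocessing and feature steps concrete, here is a minimal sketch, assuming jieba for segmentation and a made-up three-word stopword list; it is only an illustration, not the code used later in this post:

```python
import jieba
from collections import Counter

# Hypothetical stopword list (content-free function words such as 的, 了, 么)
stopwords = {'的', '了', '么'}

# Preprocessing: segment the sentence, then drop stopwords
text = '我学了一点自然语言处理的知识'
tokens = [w for w in jieba.cut(text) if w not in stopwords]

# Feature engineering: the simplest numeric representation is a
# bag-of-words count vector over the vocabulary
vocab = sorted(set(tokens))
counts = Counter(tokens)
vector = [counts[w] for w in vocab]
print(vocab, vector)
```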

2. The Distributional Hypothesis

  • Words are the most basic unit in NLP, but unlike images and audio, words are text characters, not numbers a computer can operate on directly
  • Harris proposed the distributional hypothesis: words that appear in similar contexts tend to have similar meanings
  • Since words have distributions, there are distributional models
    • e.g. statistical (count-based) models
    • e.g. neural network models
  • A one-hot vector turns a token into a 0-1 vector, but this alone is not a very convenient representation (sketch below)
    • With a vocabulary of V words, each word becomes a V-dimensional vector that is 1 in one dimension and 0 everywhere else
    • For example, for words $x_1, x_2$: $x_1 = [1, 0, \dots, 0]$, $x_2 = [0, 1, \dots, 0]$
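A minimal sketch of this one-hot encoding in numpy; the three-word vocabulary is made up for illustration:

```python
import numpy as np

vocab = ['apple', 'banana', 'cherry']  # toy vocabulary, V = 3
V = len(vocab)

# Each word gets a V-dimensional vector: 1 at its own index, 0 elsewhere
one_hot = {w: np.eye(V, dtype=int)[i] for i, w in enumerate(vocab)}

print(one_hot['apple'])   # [1 0 0]
print(one_hot['banana'])  # [0 1 0]
```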

3. Word Embeddings

4. Code Study

Preprocessing

  • When cleaning the data here, you can use pandas' regular-expression string methods, or convert each row to a plain string with str(data.iloc[i]) and use the re module (a sketch of the re variant follows)
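A sketch of the re-module variant; the row string here is hypothetical, standing in for str(data.iloc[i]):

```python
import re

# Test one row as a plain string with the re module instead of pandas .str
row = '<content>some news text</content>'
if re.match('<content>', row) and not re.match('<content></content>', row):
    print('non-empty content row')
```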
```python
import re
import pandas as pd
import logging
import os.path
import sys
import multiprocessing
import jieba

from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# Process the data
data = pd.read_table('news_sohusite_xml.smarty.dat')
dat = pd.DataFrame()

# Keep only rows with a non-empty <content> tag
# (DataFrame.append follows the original pandas 1.x API; it was removed in pandas 2.0)
for i in range(len(data)):
    if data.iloc[i].str.match('<content>').bool():
        if not data.iloc[i].str.match('<content></content>').bool():
            dat = dat.append(data.iloc[i])

dat = dat.values

print(dat[0])

# Segment with jieba; the [11:-12] slice strips the surrounding
# "['<content>" and "</content>']" from the stringified row
cut_list = []
for i in range(len(dat)):
    cut_list.append([' '.join(list(jieba.cut(str(dat[i])[11:-12], cut_all=False)))])

# Write one segmented document per line
with open('file.txt', 'w') as fw:
    for i in range(len(cut_list)):
        fw.write(cut_list[i][0])
        fw.write('\n')

# Train a 50-dimensional Word2Vec model (size= is the gensim < 4.0 name;
# gensim 4+ calls it vector_size=)
model = Word2Vec(LineSentence('file.txt'), size=50, window=5, min_count=5,
                 workers=multiprocessing.cpu_count())

model.save('file.model')
model.wv.save_word2vec_format('file.vector', binary=False)
```
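Once trained, the saved model can be queried as a quick sanity check. A minimal sketch, assuming the word 中国 survived the min_count cutoff (any in-vocabulary word works):

```python
from gensim.models import Word2Vec

model = Word2Vec.load('file.model')
# Nearest neighbours of one word in the 50-dimensional embedding space
print(model.wv.most_similar('中国', topn=5))
```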
  • The results above were not great, so I found other code to study, from 未来战士z on Jianshu and from Spytensor. Which of these two borrowed from the other, I wonder.
  • The data is plain space-separated English text; since each word is already its own token, no segmentation is needed. There are 17,005,207 words in total.

```python
# TensorFlow 1.x style code
import collections
import math
import os
import random
import zipfile
import numpy as np
import urllib
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# text8.zip was downloaded beforehand
# Step 2: unzip the file
def read_data(filename):
    """Read the first file inside the zip and split it into a list of word strings."""
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data

words = read_data('text8.zip')


vocabulary_size = 50000

# Step 3: prepare the dataset
def build_dataset(words):
    """Insert 'UNK' at the first position of the vocabulary; every word that is
    not in the vocabulary is mapped to UNK."""
    # count holds each word's frequency, sorted from most to least frequent
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    # dictionary maps each word to its index in count
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    # data records, for every word of the raw corpus, its index in count
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    # reverse_dictionary maps an index in count back to its word
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words)

data_index = 0

# Step 4: skip-gram

def generate_batch(batch_size, num_skips, skip_window):
    global data_index  # lets this function advance data_index across calls
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # span is the length of the text window
    buffer = collections.deque(maxlen=span)  # sliding window as a queue
    for _ in range(span):
        # data[data_index] is the vocabulary index of the data_index-th word,
        # e.g. data[0] = 5234, count[5234] = ('anarchism', 303)
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):  # // is integer division
        target = skip_window
        targets_to_avoid = [skip_window]  # the current input, i.e. the window's center word
        source_word = buffer[skip_window]  # vocabulary index of the center word
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)  # exclude already-used targets
            batch[i * num_skips + j] = source_word
            labels[i * num_skips + j, 0] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels

# Step 5: build a network with one hidden layer; the hidden layer has as many
# units as the word-embedding dimension we want to learn (128 here)

batch_size = 128
embedding_size = 128  # dimension of the embedding vector
skip_window = 1       # how many words to consider left and right
num_skips = 2         # how many times to reuse an input to generate a label

valid_size = 16     # random set of words to evaluate similarity on
valid_window = 100  # only pick dev samples in the head of the distribution
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64    # number of negative examples to sample

graph = tf.Graph()
with graph.as_default():
    # Define inputs and outputs
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    with tf.device('/cpu:0'):
        # Initialize the embedding matrix; after training this matrix holds the
        # final word vectors. tf.Variable creates a graph variable, here
        # initialized uniformly in [-1, 1).
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        # Map the input indices to their embeddings, [batch_size, embedding_size];
        # tf.nn.embedding_lookup picks out the corresponding rows of the matrix
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
        # Weights for the NCE (negative-sampling) loss: this is the output
        # layer. Each vocabulary word gets a weight vector that is scored
        # against the hidden activation for the true and sampled words.
        nce_weights = tf.Variable(  # 50000x128
            tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                       biases=nce_biases,
                       labels=train_labels,
                       inputs=embed,
                       num_sampled=num_sampled,
                       num_classes=vocabulary_size))
    # Build an SGD optimizer with learning rate 1.0
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    # Compute the cosine similarity between the minibatch and all embeddings;
    # tf.reduce_sum() here sums over the embedding dimension
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm  # L2-normalize each embedding vector
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    # tf.matmul: a 16x128 matrix times a 128x50000 matrix
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
    # Add the variable initializer
    init = tf.global_variables_initializer()

# Step 6: start training
num_steps = 100001  # number of training steps
# tf.Session is the class that runs TensorFlow operations
with tf.Session(graph=graph) as session:
    # All variables must be initialized before use
    init.run()
    print("Initialized")
    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(
            batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        # We perform one update step by evaluating the optimizer op (including
        # it in the list of returned values for session.run())
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val
        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print("Average loss at step", step, ": ", average_loss)
            average_loss = 0
        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = "Nearest to %s:" % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = "%s %s," % (log_str, close_word)
                print(log_str)
    final_embeddings = normalized_embeddings.eval()
```
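TSNE and matplotlib are imported at the top of the script but never used. The usual follow-up is to project the learned embeddings to 2-D and plot the most frequent words. A minimal sketch, meant to run at the end of the session above so that final_embeddings and reverse_dictionary exist (plot_only is an arbitrary choice):

```python
# Project the 500 most frequent words to 2-D with t-SNE and plot them
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
plot_only = 500
low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
labels = [reverse_dictionary[i] for i in range(plot_only)]

plt.figure(figsize=(18, 18))
for i, label in enumerate(labels):
    x, y = low_dim_embs[i, :]
    plt.scatter(x, y)
    plt.annotate(label, xy=(x, y), xytext=(5, 2),
                 textcoords='offset points', ha='right', va='bottom')
plt.savefig('tsne.png')
```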

word2vec

  • April 22: studied Zhihu, Jianshu, and CSDN posts on the math behind word2vec
  • word2vec is one method of word embedding.

  • The input is a one-hot vector; the training target is also a one-hot vector (the output layer predicts a distribution over the vocabulary).

  • A one-hot input vector has V dimensions, and the hidden layer has N units
  • For example, with input $x_1 = [1, 0, \dots, 0]$, the hidden layer computes $x_1 \times W = w_1 = [w_{11}, \dots, w_{1N}]$
  • So $w_1$ can stand in for $x_1$: the V-dimensional vector has been reduced to N dimensions. That is the essence of word2vec (a tiny worked example follows)
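A tiny worked example of this lookup with made-up numbers (V = 3, N = 2): multiplying a one-hot vector by W simply selects one row of W.

```python
import numpy as np

# V = 3 words, N = 2 hidden units; W is the input-to-hidden weight matrix
W = np.array([[0.1, 0.9],
              [0.4, 0.3],
              [0.7, 0.2]])

x1 = np.array([1, 0, 0])  # one-hot vector for the first word
w1 = x1 @ W               # picks out row 1 of W
print(w1)                 # [0.1 0.9] -- the N-dimensional embedding of x1
```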

Train a word2vec model yourself