【NLP_关系抽取】BiGRU-Dual Attention模型训练、评估与使用
import os

import numpy as np


def pos_embed(x, maxlen=60):
    """Map a relative token offset to a position-embedding index.

    Offsets below ``-maxlen`` map to 0, offsets above ``maxlen`` map to
    ``2 * maxlen + 2`` and everything in between maps to ``x + maxlen + 1``.
    With the default ``maxlen=60`` this gives indices 0..122.
    """
    if x < -maxlen:
        return 0
    if x > maxlen:
        return 2 * maxlen + 2
    return x + maxlen + 1


def find_index(x, y):
    """Return the index of the first element of ``y`` equal to ``x``, or -1."""
    for idx, item in enumerate(y):
        if x == item:
            return idx
    return -1


def _as_array(items):
    """Convert a (possibly ragged) nested list to an ndarray.

    Bags hold a variable number of sentences, so the nesting is usually
    ragged.  Older NumPy silently built an object array for that; NumPy 1.24+
    raises ``ValueError`` unless ``dtype=object`` is requested explicitly.
    """
    try:
        return np.array(items)
    except ValueError:
        return np.array(items, dtype=object)


def _encode_sentence(sentence, en1, en2, word2id, fixlen, maxlen):
    """Encode one sentence as ``fixlen`` triples ``[word_id, pos_e1, pos_e2]``.

    The sentence is processed character by character (the data is Chinese).
    Positions past the end of the sentence keep the ``BLANK`` id, unknown
    characters get the ``UNK`` id, and an entity that is not found in the
    sentence is treated as sitting at offset 0.
    """
    en1pos = max(sentence.find(en1), 0)
    en2pos = max(sentence.find(en2), 0)
    blank_id = word2id['BLANK']
    output = [
        [blank_id, pos_embed(i - en1pos, maxlen), pos_embed(i - en2pos, maxlen)]
        for i in range(fixlen)
    ]
    for i, char in enumerate(sentence[:fixlen]):
        output[i][0] = word2id[char] if char in word2id else word2id['UNK']
    return output


def _iter_samples(path, relation2id):
    """Yield ``(en1, en2, relation_id, sentence)`` for each line of ``path``.

    Lines are whitespace separated: entity 1, entity 2, relation name,
    sentence.  Relation names missing from ``relation2id`` fall back to the
    id of ``'NA'``.  Blank lines are skipped.  The file is closed once the
    generator is exhausted.
    """
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            if not parts:
                continue
            en1, en2, rel_name, sentence = parts[0], parts[1], parts[2], parts[3]
            if rel_name in relation2id:
                relation = relation2id[rel_name]
            else:
                relation = relation2id['NA']
            yield en1, en2, relation, sentence


# reading data
def init():
    """Build the numpy training/test files under ``./data`` from ``./origin_data``.

    Reads the word vectors, the relation table and the train/test corpora,
    groups sentences into bags keyed by entity pair, and writes ``vec.npy``,
    ``train_x.npy``, ``train_y.npy``, ``testall_x.npy``, ``testall_y.npy`` and
    the two human-readable ``*_q&a.txt`` index files.
    """
    print('reading word embedding data...')
    vec = []
    word2id = {}
    with open('./origin_data/token_vec_100.txt', encoding='utf-8') as f:
        # The first line is a header whose second field is the vector size.
        dim = int(f.readline().strip().split()[1])
        for line in f:
            parts = line.strip().split()
            if not parts:
                continue
            word2id[parts[0]] = len(word2id)
            vec.append([float(v) for v in parts[1:]])
    # Two extra rows: unknown character and padding, randomly initialised.
    word2id['UNK'] = len(word2id)
    word2id['BLANK'] = len(word2id)
    vec.append(np.random.normal(size=dim, loc=0, scale=0.05))
    vec.append(np.random.normal(size=dim, loc=0, scale=0.05))
    vec = np.array(vec, dtype=np.float32)

    print('reading relation to id')
    relation2id = {}
    with open('./origin_data/ywp_relation2id.txt', 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            if not parts:
                continue
            relation2id[parts[0]] = int(parts[1])
    num_relations = len(relation2id)

    # length of sentence is 70
    fixlen = 70
    # max length of position embedding is 60 (-60~+60)
    maxlen = 60

    # {entity pair: [[label1-sentence 1, label1-sentence 2, ...], [label2-sentence 1, ...]]}
    train_sen = {}
    # {entity pair: [label1, label2, ...]}, each label is a one-hot vector
    train_ans = {}

    print('reading train data...')
    for en1, en2, relation, sentence in _iter_samples('./origin_data/ywp_train.txt', relation2id):
        tup = (en1, en2)
        label = [0] * num_relations
        label[relation] = 1
        if tup not in train_sen:
            train_sen[tup] = [[]]
            train_ans[tup] = [label]
            label_tag = 0
        else:
            # One bag per distinct (entity pair, label) combination.
            label_tag = find_index(label, train_ans[tup])
            if label_tag == -1:
                train_ans[tup].append(label)
                train_sen[tup].append([])
                label_tag = len(train_ans[tup]) - 1
        train_sen[tup][label_tag].append(
            _encode_sentence(sentence, en1, en2, word2id, fixlen, maxlen))

    print('reading test data ...')
    # {entity pair: [sentence 1, sentence 2, ...]}
    test_sen = {}
    # {entity pair: N-hot label vector (N = number of relations seen for the pair)}
    test_ans = {}
    for en1, en2, relation, sentence in _iter_samples('./origin_data/ywp_test.txt', relation2id):
        tup = (en1, en2)
        if tup not in test_sen:
            test_sen[tup] = []
            label = [0] * num_relations
            label[relation] = 1
            test_ans[tup] = label
        else:
            test_ans[tup][relation] = 1
        test_sen[tup].append(
            _encode_sentence(sentence, en1, en2, word2id, fixlen, maxlen))

    train_x = []
    train_y = []
    test_x = []
    test_y = []

    os.makedirs("data", exist_ok=True)

    print('organizing train data')
    with open('./data/ywp_train_q&a.txt', 'w', encoding='utf-8') as f:
        temp = 0
        for pair in train_sen:
            if len(train_ans[pair]) != len(train_sen[pair]):
                print('ERROR')
            for j in range(len(train_ans[pair])):
                train_x.append(train_sen[pair][j])
                train_y.append(train_ans[pair][j])
                f.write(str(temp) + '\t' + pair[0] + '\t' + pair[1] + '\t'
                        + str(np.argmax(train_ans[pair][j])) + '\n')
                temp += 1

    print('organizing test data')
    with open('./data/ywp_test_q&a.txt', 'w', encoding='utf-8') as f:
        temp = 0
        for pair in test_sen:
            test_x.append(test_sen[pair])
            test_y.append(test_ans[pair])
            tempstr = ''.join(
                str(j) + '\t' for j, flag in enumerate(test_ans[pair]) if flag != 0)
            f.write(str(temp) + '\t' + pair[0] + '\t' + pair[1] + '\t' + tempstr + '\n')
            temp += 1

    np.save('./data/vec.npy', vec)
    np.save('./data/train_x.npy', _as_array(train_x))
    np.save('./data/train_y.npy', _as_array(train_y))
    np.save('./data/testall_x.npy', _as_array(test_x))
    np.save('./data/testall_y.npy', _as_array(test_y))


def _split_channels(bags):
    """Split bags of ``[word, pos1, pos2]`` triples into three parallel nested lists."""
    words, pos1, pos2 = [], [], []
    for bag in bags:
        words.append([[tok[0] for tok in sent] for sent in bag])
        pos1.append([[tok[1] for tok in sent] for sent in bag])
        pos2.append([[tok[2] for tok in sent] for sent in bag])
    return words, pos1, pos2


def seperate():
    """Split ``train_x``/``testall_x`` into separate word / pos1 / pos2 files."""
    print('reading training data')
    # Ragged bags are stored as pickled object arrays; NumPy >= 1.16.3 refuses
    # to load those unless allow_pickle=True.  The file was written by init().
    x_train = np.load('./data/train_x.npy', allow_pickle=True)

    print('seprating train data')
    train_word, train_pos1, train_pos2 = _split_channels(x_train)
    train_word = _as_array(train_word)
    train_pos1 = _as_array(train_pos1)
    train_pos2 = _as_array(train_pos2)
    np.save('./data/train_word.npy', train_word)
    np.save('./data/train_pos1.npy', train_pos1)
    np.save('./data/train_pos2.npy', train_pos2)

    print('seperating test all data')
    x_test = np.load('./data/testall_x.npy', allow_pickle=True)
    test_word, test_pos1, test_pos2 = _split_channels(x_test)
    test_word = _as_array(test_word)
    test_pos1 = _as_array(test_pos1)
    test_pos2 = _as_array(test_pos2)
    np.save('./data/testall_word.npy', test_word)
    np.save('./data/testall_pos1.npy', test_pos1)
    np.save('./data/testall_pos2.npy', test_pos2)


# get answer metric for PR curve evaluation
def getans():
    """Write ``allans.npy``: the test labels with column 0 dropped, flattened."""
    test_y = np.load('./data/testall_y.npy')
    eval_y = [row[1:] for row in test_y]
    allans = np.reshape(eval_y, (-1))
    np.save('./data/allans.npy', allans)


def get_metadata():
    """Write one token per line to ``./data/metadata.tsv`` from the vector file.

    Blank lines are skipped rather than treated as end of file, so the token
    list covers the same lines that ``init`` reads.  The ``UNK`` and ``BLANK``
    rows that ``init`` appends to the embedding matrix are not written here.
    """
    with open('./data/metadata.tsv', 'w', encoding='utf-8') as fwrite, \
            open('./origin_data/token_vec_100.txt', encoding='utf-8') as f:
        f.readline()  # skip the "<count> <dim>" header
        for line in f:
            content = line.strip()
            if content == '':
                continue
            fwrite.write(content.split()[0] + '\n')


init()
seperate()
getans()
get_metadata()




而我定义的向量维度并非12,使用原模型运行,报错“Assign requires shapes of both tensors to match”(如下图),即输入的向量维度与模型中的不同。


for one_epoch in range(1000):
import tensorflow as tf
import numpy as np
import time
import datetime
import os
import network
from tensorflow.contrib.tensorboard.plugins import projector

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('summary_dir', '.', 'path to store summary')

# Number of passes over the training bags.  settings.num_epochs is printed for
# reference only; the loop below uses this value.
NUM_EPOCHS = 1000

# A batch whose bags contain more than this many sentences in total is skipped.
MAX_SENTENCES_PER_BATCH = 1500


def _flatten_bags(word_batch, pos1_batch, pos2_batch):
    """Concatenate the sentences of all bags in a batch.

    Returns ``(total_shape, total_word, total_pos1, total_pos2)`` as numpy
    arrays.  ``total_shape`` holds the cumulative sentence count before each
    bag followed by the grand total, i.e. ``len(word_batch) + 1`` entries.
    """
    total_shape = [0]
    total_word = []
    total_pos1 = []
    total_pos2 = []
    for words, pos1, pos2 in zip(word_batch, pos1_batch, pos2_batch):
        total_word.extend(words)
        total_pos1.extend(pos1)
        total_pos2.extend(pos2)
        total_shape.append(total_shape[-1] + len(words))
    return (np.array(total_shape), np.array(total_word),
            np.array(total_pos1), np.array(total_pos2))


def main(_):
    """Train the model on ``./data/*.npy`` and save it under ``./model/``."""
    # the path to save models
    save_path = './model/'
    # Make sure the checkpoint directory exists before saver.save is called.
    os.makedirs(save_path, exist_ok=True)

    print('reading wordembedding')
    wordembedding = np.load('./data/vec.npy')

    print('reading training data')
    train_y = np.load('./data/train_y.npy')
    # Bags have a variable number of sentences, so these are pickled object
    # arrays; NumPy >= 1.16.3 only loads them with allow_pickle=True.
    train_word = np.load('./data/train_word.npy', allow_pickle=True)
    train_pos1 = np.load('./data/train_pos1.npy', allow_pickle=True)
    train_pos2 = np.load('./data/train_pos2.npy', allow_pickle=True)

    settings = network.Settings()
    settings.vocab_size = len(wordembedding)
    print("train_y[0]",len(train_y[0]))
    settings.num_classes = len(train_y[0])

    with tf.Graph().as_default():
        # The context manager installs the session as default and closes it
        # when training finishes or fails.
        with tf.Session() as sess:
            initializer = tf.contrib.layers.xavier_initializer()
            with tf.variable_scope("model", reuse=None, initializer=initializer):
                m = network.GRU(is_training=True, word_embeddings=wordembedding, settings=settings)
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(0.0005)

            train_op = optimizer.minimize(m.final_loss, global_step=global_step)
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver(max_to_keep=None)

            merged_summary = tf.summary.merge_all()
            summary_writer = tf.summary.FileWriter(FLAGS.summary_dir + '/train_loss', sess.graph)

            def train_step(word_batch, pos1_batch, pos2_batch, y_batch, big_num):
                """Run one optimisation step on a batch of bags and log loss/accuracy."""
                print("len(word_batch)",len(word_batch))
                total_shape, total_word, total_pos1, total_pos2 = _flatten_bags(
                    word_batch, pos1_batch, pos2_batch)

                feed_dict = {
                    m.total_shape: total_shape,
                    m.input_word: total_word,
                    m.input_pos1: total_pos1,
                    m.input_pos2: total_pos2,
                    m.input_y: y_batch,
                }

                _, step, loss, accuracy, summary, _, _ = sess.run(
                    [train_op, global_step, m.total_loss, m.accuracy, merged_summary, m.l2_loss, m.final_loss],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                accuracy = np.reshape(np.array(accuracy), (big_num))
                acc = np.mean(accuracy)
                summary_writer.add_summary(summary, step)

                tempstr = "{}: step {}, softmax_loss {:g}, acc {:g}".format(time_str, step, loss, acc)
                print(tempstr)

            try:
                print("settings.num_epochs",settings.num_epochs)
                for one_epoch in range(NUM_EPOCHS):
                    temp_order = list(range(len(train_word)))
                    np.random.shuffle(temp_order)
                    # Only full batches are used; the trailing partial batch of
                    # each epoch is dropped.
                    for i in range(len(temp_order) // settings.big_num):
                        temp_input = temp_order[i * settings.big_num:(i + 1) * settings.big_num]
                        temp_word = [train_word[k] for k in temp_input]
                        temp_pos1 = [train_pos1[k] for k in temp_input]
                        temp_pos2 = [train_pos2[k] for k in temp_input]
                        temp_y = np.array([train_y[k] for k in temp_input])

                        num = sum(len(single_word) for single_word in temp_word)
                        if num > MAX_SENTENCES_PER_BATCH:
                            print('out of range')
                            continue

                        # The bag lists are only iterated inside train_step, so
                        # they are passed as plain lists; this avoids building
                        # a ragged ndarray, which NumPy >= 1.24 rejects.
                        train_step(temp_word, temp_pos1, temp_pos2, temp_y, settings.big_num)

                        current_step = tf.train.global_step(sess, global_step)
                        print("current_step",current_step)

                print('saving model')
                path = saver.save(sess, save_path + 'ATT_GRU_model')
                tempstr = 'have saved model to ' + path
                print(tempstr)
            finally:
                # Flush pending summary events to disk.
                summary_writer.close()


if __name__ == "__main__":
    tf.app.run()





infile = open(                'D:\\Asian elephant\\biye\\Spatial relation extraction\\Information-Extraction-Chinese-master\\RE_BGRU_2ATT\\origin_data\\ywp_use2.txt',                encoding='utf-8')            for orgline in infile:                print(orgline)                en1, en2, sentence = orgline.split()                ywpoutfile = open(                    'D:\\Asian elephant\\biye\\Spatial relation extraction\\Information-Extraction-Chinese-master\\RE_BGRU_2ATT\\origin_data\\ywp_use_result2.txt',                    'a')                ywpoutfile.write('\n' + en1 + ' ' + en2 + '\n')


prob, accuracy = test_step(test_word, test_pos1, test_pos2, test_y)                prob = np.reshape(np.array(prob), (1, test_settings.num_classes))[0]                top3_id = prob.argsort()[-3:][::-1]                for n, rel_id in enumerate(top3_id):                    ywpoutfile.write("No." + str(n + 1) + ": " + id2relation[rel_id] + ", Probability is " + str(                        prob[rel_id]) + '\n')



from pprint import pprint
import tensorflow as tf
import numpy as np
import time
import datetime
import os
import network
from sklearn.metrics import average_precision_score

FLAGS = tf.app.flags.FLAGS
import warnings

warnings.filterwarnings(action='ignore')


# embedding the position
def pos_embed(x):
    """Map a relative token offset to a position index in 0..122.

    Offsets below -60 map to 0, offsets above 60 map to 122 and everything in
    between maps to ``x + 61``.
    """
    if x < -60:
        return 0
    if x > 60:
        return 122
    return x + 61


def _flatten_bags(word_batch, pos1_batch, pos2_batch):
    """Concatenate the sentences of all bags in a batch.

    Returns ``(total_shape, total_word, total_pos1, total_pos2)`` as numpy
    arrays.  ``total_shape`` holds the cumulative sentence count before each
    bag followed by the grand total.
    """
    total_shape = [0]
    total_word = []
    total_pos1 = []
    total_pos2 = []
    for words, pos1, pos2 in zip(word_batch, pos1_batch, pos2_batch):
        total_word.extend(words)
        total_pos1.extend(pos1)
        total_pos2.extend(pos2)
        total_shape.append(total_shape[-1] + len(words))
    return (np.array(total_shape), np.array(total_word),
            np.array(total_pos1), np.array(total_pos2))


def _make_test_step(sess, mtest):
    """Return ``test_step(word_batch, pos1_batch, pos2_batch, y_batch)``.

    The returned function feeds one batch of bags to ``mtest`` through
    ``sess`` and returns ``(prob, accuracy)``.
    """

    def test_step(word_batch, pos1_batch, pos2_batch, y_batch):
        total_shape, total_word, total_pos1, total_pos2 = _flatten_bags(
            word_batch, pos1_batch, pos2_batch)
        feed_dict = {
            mtest.total_shape: total_shape,
            mtest.input_word: total_word,
            mtest.input_pos1: total_pos1,
            mtest.input_pos2: total_pos2,
            mtest.input_y: y_batch,
        }
        _, accuracy, prob = sess.run(
            [mtest.loss, mtest.accuracy, mtest.prob], feed_dict)
        return prob, accuracy

    return test_step


def _load_word2id(path):
    """Read the token -> id table from a word-vector text file.

    The first line is skipped as a header.  Ids follow line order; ``UNK`` and
    ``BLANK`` are appended last.  The vector values themselves are not parsed
    because the embedding matrix is loaded from ``vec.npy``.
    """
    word2id = {}
    with open(path, encoding='utf-8') as f:
        f.readline()
        for line in f:
            content = line.strip().split()
            if not content:
                continue
            word2id[content[0]] = len(word2id)
    word2id['UNK'] = len(word2id)
    word2id['BLANK'] = len(word2id)
    return word2id


def _encode_sentence(sentence, en1, en2, word2id, fixlen):
    """Encode one sentence as ``fixlen`` triples ``[word_id, pos_e1, pos_e2]``.

    Characters beyond the sentence keep the ``BLANK`` id, unknown characters
    get ``UNK``, and an entity not found in the sentence is placed at offset 0.
    """
    en1pos = max(sentence.find(en1), 0)
    # The earlier code assigned to a misspelled name here, leaving en2pos at
    # -1 for missing entities and shifting every pos_e2 feature by one.
    en2pos = max(sentence.find(en2), 0)
    blank_id = word2id['BLANK']
    output = [[blank_id, pos_embed(i - en1pos), pos_embed(i - en2pos)]
              for i in range(fixlen)]
    for i, char in enumerate(sentence[:fixlen]):
        output[i][0] = word2id[char] if char in word2id else word2id['UNK']
    return output


def main_for_evaluation():
    """Evaluate saved checkpoints on the full test set and print the PR-curve area."""
    pathname = "./model/ATT_GRU_model"

    wordembedding = np.load('./data/vec.npy')
    test_settings = network.Settings()
    # Derived from the embedding matrix, as the training script does, instead
    # of a hard-coded row count.
    test_settings.vocab_size = len(wordembedding)
    # Must match the number of relations the checkpoint was trained with.
    test_settings.num_classes = 8
    # NOTE(review): presumably the exact number of test bags, so the whole
    # test set runs as one batch - confirm against testall_word.npy.
    test_settings.big_num = 5561
    big_num = test_settings.big_num

    with tf.Graph().as_default():
        # The context manager installs the session as default and closes it.
        with tf.Session() as sess:
            with tf.variable_scope("model"):
                mtest = network.GRU(is_training=False, word_embeddings=wordembedding, settings=test_settings)
            test_step = _make_test_step(sess, mtest)

            names_to_vars = {v.op.name: v for v in tf.global_variables()}
            saver = tf.train.Saver(names_to_vars)

            # testlist = range(1000, 1800, 100)
            testlist = [9000]

            for model_iter in testlist:
                # for compatibility purposes only, name key changes from tf 0.x to 1.x, compat_layer
                saver.restore(sess, pathname + str(model_iter))

                time_str = datetime.datetime.now().isoformat()
                print(time_str)
                print('Evaluating all test data and save data for PR curve')

                test_y = np.load('./data/testall_y.npy')
                # Bags are ragged, so these are pickled object arrays;
                # NumPy >= 1.16.3 only loads them with allow_pickle=True.
                test_word = np.load('./data/testall_word.npy', allow_pickle=True)
                test_pos1 = np.load('./data/testall_pos1.npy', allow_pickle=True)
                test_pos2 = np.load('./data/testall_pos2.npy', allow_pickle=True)

                allprob = []
                # Only full batches are evaluated; a trailing partial batch is dropped.
                for i in range(len(test_word) // big_num):
                    batch = slice(i * big_num, (i + 1) * big_num)
                    prob, _ = test_step(test_word[batch], test_pos1[batch],
                                        test_pos2[batch], test_y[batch])
                    prob = np.reshape(np.array(prob), (big_num, test_settings.num_classes))
                    # Column 0 is dropped, matching how allans.npy is built.
                    allprob.extend(single_prob[1:] for single_prob in prob)
                allprob = np.reshape(np.array(allprob), (-1))

                print('saving all test result...')
                current_step = model_iter

                os.makedirs('./out', exist_ok=True)
                np.save('./out/allprob_iter_' + str(current_step) + '.npy', allprob)
                allans = np.load('./data/allans.npy')

                # caculate the pr curve area
                average_precision = average_precision_score(allans, allprob)
                print('PR curve area:' + str(average_precision))


def main(_):
    """Predict the top-3 relations for every ``en1 en2 sentence`` line of the input file.

    Results are appended to the output file, one header line per entity pair
    followed by up to three ``No.k: relation, Probability is p`` lines.
    """
    # If you retrain the model, please remember to change the path to your own model below:
    pathname = "./model/ATT_GRU_model"

    wordembedding = np.load('./data/vec.npy')
    test_settings = network.Settings()
    # Derived from the embedding matrix, as the training script does.
    test_settings.vocab_size = len(wordembedding)
    # Must match the number of relations the checkpoint was trained with.
    test_settings.num_classes = 8
    test_settings.big_num = 1

    with tf.Graph().as_default():
        # The context manager installs the session as default and closes it.
        with tf.Session() as sess:
            with tf.variable_scope("model"):
                mtest = network.GRU(is_training=False, word_embeddings=wordembedding, settings=test_settings)
            test_step = _make_test_step(sess, mtest)

            names_to_vars = {v.op.name: v for v in tf.global_variables()}
            saver = tf.train.Saver(names_to_vars)
            saver.restore(sess, pathname)

            print('reading word embedding data...')
            word2id = _load_word2id('./origin_data/token_vec_100.txt')

            print('reading relation to id')
            relation2id = {}
            id2relation = {}
            # Raw string: same path value as before, without relying on
            # unrecognised backslash escapes.
            with open(
                    r'D:\Asian elephant\毕业\空间关系抽取\所用:可运行 BiGPU关系抽取模型\Information-Extraction-Chinese-master\RE_BGRU_2ATT\origin_data\ywp_relation2id.txt.',
                    'r', encoding='utf-8') as f:
                for line in f:
                    content = line.strip().split()
                    if not content:
                        continue
                    relation2id[content[0]] = int(content[1])
                    id2relation[int(content[1])] = content[0]

            # length of sentence is 70
            fixlen = 70

            # NOTE(review): both paths below are placeholders in this file -
            # replace them with the real input and result file paths.
            with open('…….txt', encoding='utf-8') as infile:
                for orgline in infile:
                    print(orgline)
                    fields = orgline.split()
                    if not fields:
                        # Blank line (e.g. trailing newline at end of file).
                        continue
                    en1, en2, sentence = fields

                    # Encoding test x: a single bag holding a single sentence.
                    output = _encode_sentence(sentence, en1, en2, word2id, fixlen)
                    test_word = np.array([[[tok[0] for tok in output]]])
                    test_pos1 = np.array([[[tok[1] for tok in output]]])
                    test_pos2 = np.array([[[tok[2] for tok in output]]])

                    # Encoding test y: a dummy one-hot label, required by the
                    # feed but not used for the written prediction.
                    label = [0 for i in range(len(relation2id))]
                    label[0] = 1
                    test_y = np.array([label])

                    # Opened per sentence so each result is flushed and the
                    # handle is closed before the next line is processed.
                    with open('…….txt', 'a') as ywpoutfile:
                        ywpoutfile.write('\n' + en1 + ' ' + en2 + '\n')

                        prob, accuracy = test_step(test_word, test_pos1, test_pos2, test_y)
                        prob = np.reshape(np.array(prob), (1, test_settings.num_classes))[0]

                        top3_id = prob.argsort()[-3:][::-1]
                        for n, rel_id in enumerate(top3_id):
                            ywpoutfile.write("No." + str(n + 1) + ": " + id2relation[rel_id] + ", Probability is " + str(
                                prob[rel_id]) + '\n')


if __name__ == "__main__":
    tf.app.run()















