Last active
January 25, 2019 02:11
-
-
Save upidea/3265fe15086ca91ae71060c6980a0c40 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Shuffle and split the padded sequences into train/validation sets with NumPy.
# Assumes `sequences`, `labels`, MAX_SEQUENCE_LENGTH and VALIDATION_SPLIT are
# defined earlier in the script (not visible in this chunk) — TODO confirm.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))  # one-hot encode integer labels
print('Shape of Data Tensor:', data.shape)
print('Shape of Label Tensor:', labels.shape)

# Apply the same random permutation to samples and labels so they stay aligned.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

# Hold out the last VALIDATION_SPLIT fraction as the validation set.
# BUG FIX: the original sliced with `data[:-nb_validation_samples]`, which is
# `data[:-0]` — an EMPTY array — whenever the rounded validation size is 0
# (tiny datasets or VALIDATION_SPLIT == 0). Compute the split point explicitly.
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
split_at = data.shape[0] - nb_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]
# Split off a validation set with pandas (not keras): randomly sample 20% of
# the rows with a fixed seed for reproducibility, then remove exactly those
# rows from the training frame by index.
val_data = train_data.sample(frac=0.2, random_state=200)
train_data = train_data.drop(index=val_data.index)
# Label-encode the `author` column: assign each unique author a small integer
# id (0..n_authors-1) in order of first appearance.
authors = train_data.author.unique()
# Idiom fix: build the mapping with a dict comprehension instead of a manual
# loop, and use Series.map(dict) instead of apply(lambda x: dic[x]).
# Every key is guaranteed present (the dict was built from this very column),
# so map() produces identical values to the original apply().
dic = {author: i for i, author in enumerate(authors)}
labels = train_data.author.map(dic)
# https://www.kaggle.com/marijakekic/cnn-in-keras-with-pretrained-word2vec-weights
# Load pretrained word2vec vectors and build the embedding weight matrix.
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

# Google News word2vec binary (300-dimensional vectors).
word_vectors = KeyedVectors.load_word2vec_format(
    '../GoogleNews-vectors-negative300.bin', binary=True)

EMBEDDING_DIM = 300
# +1 because Keras tokenizer indices start at 1 (row 0 is left all-zero,
# presumably the padding row — confirm against the tokenizer setup).
# Cap at NUM_WORDS when the tokenizer vocabulary was limited.
vocabulary_size = min(len(word_index) + 1, NUM_WORDS)
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= NUM_WORDS:
        continue  # index beyond the capped vocabulary — no matrix row for it
    try:
        embedding_matrix[i] = word_vectors[word]
    except KeyError:
        # Word not in the pretrained vocabulary: initialise its row with
        # zero-mean noise (std 0.5) so it can still be learned during training.
        embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)

# Free the large pretrained-vector object once the matrix is built.
# Idiom fix: `del` is a statement, not a function — no parentheses.
del word_vectors
from keras.layers import Embedding

# Embedding layer seeded with the pretrained word2vec matrix.
# trainable=True allows the vectors to be fine-tuned during training.
embedding_layer = Embedding(
    input_dim=vocabulary_size,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    trainable=True,
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment