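# (Not in the original gist) The snippets below use `sequences`, `word_index`,
# `labels`, and a few constants without defining them. A minimal sketch of the
# assumed preprocessing step with the Keras Tokenizer; the constant values and
# the `texts` list of raw strings are illustrative assumptions:
from keras.preprocessing.text import Tokenizer

MAX_SEQUENCE_LENGTH = 1000  # assumed value
NUM_WORDS = 20000           # assumed value
VALIDATION_SPLIT = 0.2      # assumed value

tokenizer = Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(texts)                     # texts: list of raw strings (assumed)
sequences = tokenizer.texts_to_sequences(texts)   # lists of word indices
word_index = tokenizer.word_index                 # word -> index mapping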
# Shuffle and split the data directly with numpy
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))
print('Shape of Data Tensor:', data.shape)
print('Shape of Label Tensor:', labels.shape)

# Shuffle by permuting the row indices, then slice off the validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]
# Alternatively, split the data with pandas' DataFrame.sample
val_data = train_data.sample(frac=0.2, random_state=200)
train_data = train_data.drop(val_data.index)
# Label-encode a column of the data (here: author)
authors = train_data.author.unique()
dic = {}
for i, author in enumerate(authors):
    dic[author] = i
labels = train_data.author.apply(lambda x: dic[x])
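# (Not in the original gist) The same integer mapping can be built with
# scikit-learn's LabelEncoder; a minimal sketch, assuming `train_data` is the
# DataFrame above:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
labels = le.fit_transform(train_data.author)  # one integer code per row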
# https://www.kaggle.com/marijakekic/cnn-in-keras-with-pretrained-word2vec-weights
# Load the pretrained word2vec vectors and build the embedding matrix
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

word_vectors = KeyedVectors.load_word2vec_format('../GoogleNews-vectors-negative300.bin', binary=True)

EMBEDDING_DIM = 300
vocabulary_size = min(len(word_index) + 1, NUM_WORDS)
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= NUM_WORDS:
        continue
    try:
        embedding_vector = word_vectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        # Words missing from word2vec get a random vector
        embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)

del word_vectors  # free the large word2vec model once the matrix is built
from keras.layers import Embedding

embedding_layer = Embedding(vocabulary_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=True)
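# (Not in the original gist) A minimal sketch of how the embedding layer might
# feed a text CNN, in the spirit of the Kaggle kernel linked above; the filter
# count, kernel size, class count, and training settings are illustrative
# assumptions:
from keras.models import Sequential
from keras.layers import Conv1D, GlobalMaxPooling1D, Dense

model = Sequential()
model.add(embedding_layer)                    # (batch, seq_len, EMBEDDING_DIM)
model.add(Conv1D(128, 5, activation='relu'))  # assumed filters / kernel size
model.add(GlobalMaxPooling1D())
model.add(Dense(3, activation='softmax'))     # assumed 3 author classes
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=5, batch_size=128)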