Last active
January 25, 2019 02:11
-
-
Save upidea/3265fe15086ca91ae71060c6980a0c40 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Shuffle and split the padded sequences into train/validation sets with NumPy.
# Assumes `sequences`, `labels`, MAX_SEQUENCE_LENGTH and VALIDATION_SPLIT are
# defined earlier in the script (not visible in this chunk) — TODO confirm.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))  # one-hot encode integer labels
print('Shape of Data Tensor:', data.shape)
print('Shape of Label Tensor:', labels.shape)

# Apply the same random permutation to samples and labels so they stay aligned.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

# Hold out the last VALIDATION_SPLIT fraction as the validation set.
# BUG FIX: the original sliced with `data[:-nb_validation_samples]`, which is
# `data[:-0]` — an EMPTY array — whenever the rounded validation size is 0
# (tiny datasets or VALIDATION_SPLIT == 0). Compute the split point explicitly.
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
split_at = data.shape[0] - nb_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]
# Split off a validation set with pandas (not keras): randomly sample 20% of
# the rows with a fixed seed for reproducibility, then remove exactly those
# rows from the training frame by index.
val_data = train_data.sample(frac=0.2, random_state=200)
train_data = train_data.drop(index=val_data.index)
# Label-encode the `author` column: assign each unique author a small integer
# id (0..n_authors-1) in order of first appearance.
authors = train_data.author.unique()
# Idiom fix: build the mapping with a dict comprehension instead of a manual
# loop, and use Series.map(dict) instead of apply(lambda x: dic[x]).
# Every key is guaranteed present (the dict was built from this very column),
# so map() produces identical values to the original apply().
dic = {author: i for i, author in enumerate(authors)}
labels = train_data.author.map(dic)
# https://www.kaggle.com/marijakekic/cnn-in-keras-with-pretrained-word2vec-weights
# Load pretrained word2vec vectors and build the embedding weight matrix.
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors

# Google News word2vec binary (300-dimensional vectors).
word_vectors = KeyedVectors.load_word2vec_format(
    '../GoogleNews-vectors-negative300.bin', binary=True)

EMBEDDING_DIM = 300
# +1 because Keras tokenizer indices start at 1 (row 0 is left all-zero,
# presumably the padding row — confirm against the tokenizer setup).
# Cap at NUM_WORDS when the tokenizer vocabulary was limited.
vocabulary_size = min(len(word_index) + 1, NUM_WORDS)
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= NUM_WORDS:
        continue  # index beyond the capped vocabulary — no matrix row for it
    try:
        embedding_matrix[i] = word_vectors[word]
    except KeyError:
        # Word not in the pretrained vocabulary: initialise its row with
        # zero-mean noise (std 0.5) so it can still be learned during training.
        embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)

# Free the large pretrained-vector object once the matrix is built.
# Idiom fix: `del` is a statement, not a function — no parentheses.
del word_vectors
from keras.layers import Embedding

# Embedding layer seeded with the pretrained word2vec matrix.
# trainable=True allows the vectors to be fine-tuned during training.
embedding_layer = Embedding(
    input_dim=vocabulary_size,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    trainable=True,
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment