Code for How to Perform Text Classification in Python using Tensorflow 2 and Keras Tutorial


View on Github

parameters.py

from tensorflow.keras.layers import LSTM

# max number of words in each sentence
SEQUENCE_LENGTH = 300
# N-Dimensional GloVe embedding vectors
EMBEDDING_SIZE = 300
# number of words to use, discarding the rest
N_WORDS = 10000
# out of vocabulary token
OOV_TOKEN = None
# 30% testing set, 70% training set
TEST_SIZE = 0.3
# number of CELL layers
N_LAYERS = 1
# the RNN cell to use, LSTM in this case
RNN_CELL = LSTM
# whether it's a bidirectional RNN
IS_BIDIRECTIONAL = False
# number of units (RNN_CELL, nodes) in each layer
UNITS = 128
# dropout rate
DROPOUT = 0.4

### Training parameters
LOSS = "categorical_crossentropy"
OPTIMIZER = "adam"
BATCH_SIZE = 64
EPOCHS = 6


def get_model_name(dataset_name):
    # construct the unique model name
    model_name = f"{dataset_name}-{RNN_CELL.__name__}-seq-{SEQUENCE_LENGTH}-em-{EMBEDDING_SIZE}-w-{N_WORDS}-layers-{N_LAYERS}-units-{UNITS}-opt-{OPTIMIZER}-BS-{BATCH_SIZE}-d-{DROPOUT}"
    if IS_BIDIRECTIONAL:
        # add 'bid' str if bidirectional
        model_name = "bid-" + model_name
    if OOV_TOKEN:
        # add 'oov' str if OOV token is specified
        model_name += "-oov"
    return model_name
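With the default hyperparameters above, get_model_name bakes every setting into one string, so each experiment gets its own TensorBoard run directory and weights file. A quick sanity check (the expected output assumes the unmodified defaults):

from parameters import get_model_name

# prints: imdb-LSTM-seq-300-em-300-w-10000-layers-1-units-128-opt-adam-BS-64-d-0.4
print(get_model_name("imdb"))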

utils.py

from tqdm import tqdm
import numpy as np
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups


def get_embedding_vectors(word_index, embedding_size=100):
    # start from an all-zero matrix; words without a GloVe vector stay zero
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_size))
    with open(f"data/glove.6B.{embedding_size}d.txt", encoding="utf8") as f:
        for line in tqdm(f, "Reading GloVe"):
            values = line.split()
            # get the word as the first word in the line
            word = values[0]
            if word in word_index:
                idx = word_index[word]
                # get the vector as the remaining values in the line
                embedding_matrix[idx] = np.array(values[1:], dtype="float32")
    return embedding_matrix


def create_model(word_index, units=128, n_layers=1, cell=LSTM, bidirectional=False,
                 embedding_size=100, sequence_length=100, dropout=0.3,
                 loss="categorical_crossentropy", optimizer="adam",
                 output_length=2):
    """
    Constructs an RNN model given its parameters
    """
    embedding_matrix = get_embedding_vectors(word_index, embedding_size)
    model = Sequential()
    # add the embedding layer (frozen, pre-trained GloVe weights)
    model.add(Embedding(len(word_index) + 1,
                        embedding_size,
                        weights=[embedding_matrix],
                        trainable=False,
                        input_length=sequence_length))
    for i in range(n_layers):
        if i == n_layers - 1:
            # last layer
            if bidirectional:
                model.add(Bidirectional(cell(units, return_sequences=False)))
            else:
                model.add(cell(units, return_sequences=False))
        else:
            # first layer or hidden layers
            if bidirectional:
                model.add(Bidirectional(cell(units, return_sequences=True)))
            else:
                model.add(cell(units, return_sequences=True))
        model.add(Dropout(dropout))
    model.add(Dense(output_length, activation="softmax"))
    # compile the model
    model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])
    return model


def load_imdb_data(num_words, sequence_length, test_size=0.25, oov_token=None):
    # read reviews
    reviews = []
    with open("data/reviews.txt") as f:
        for review in f:
            review = review.strip()
            reviews.append(review)

    labels = []
    with open("data/labels.txt") as f:
        for label in f:
            label = label.strip()
            labels.append(label)

    # tokenize the dataset corpus, delete uncommon words such as names, etc.
    tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
    tokenizer.fit_on_texts(reviews)
    X = tokenizer.texts_to_sequences(reviews)
    # pad sequences with 0's up to sequence_length (returns a 2D numpy array)
    X = pad_sequences(X, maxlen=sequence_length)
    # labels.txt contains the strings "negative"/"positive"; map them to 0/1
    label2int = {"negative": 0, "positive": 1}
    y = np.array([label2int[label] for label in labels])
    # convert labels to one-hot encoded vectors
    y = to_categorical(y)
    # split data to training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=1)
    data = {}
    data["X_train"] = X_train
    data["X_test"] = X_test
    data["y_train"] = y_train
    data["y_test"] = y_test
    data["tokenizer"] = tokenizer
    data["int2label"] = {0: "negative", 1: "positive"}
    data["label2int"] = label2int
    return data


def load_20_newsgroup_data(num_words, sequence_length, test_size=0.25, oov_token=None):
    # load the 20 newsgroups dataset,
    # shuffling the data & removing each document's header, signature blocks and quotation blocks
    dataset = fetch_20newsgroups(subset="all", shuffle=True, remove=("headers", "footers", "quotes"))
    documents = dataset.data
    labels = dataset.target
    tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
    tokenizer.fit_on_texts(documents)
    X = tokenizer.texts_to_sequences(documents)
    # pad sequences with 0's up to sequence_length (returns a 2D numpy array)
    X = pad_sequences(X, maxlen=sequence_length)
    # convert the integer labels to one-hot encoded vectors
    y = to_categorical(labels)
    # split data to training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=1)
    data = {}
    data["X_train"] = X_train
    data["X_test"] = X_test
    data["y_train"] = y_train
    data["y_test"] = y_test
    data["tokenizer"] = tokenizer
    data["int2label"] = {i: label for i, label in enumerate(dataset.target_names)}
    data["label2int"] = {label: i for i, label in enumerate(dataset.target_names)}
    return data
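get_embedding_vectors expects the pre-trained GloVe vectors to live under data/ (e.g. data/glove.6B.300d.txt when EMBEDDING_SIZE = 300), and load_imdb_data expects data/reviews.txt and data/labels.txt. A minimal sketch for fetching the GloVe file, assuming the standard Stanford download URL and a download_glove helper name of our own (roughly an 800 MB download):

import os
import urllib.request
import zipfile

# assumed Stanford mirror for the GloVe 6B vectors; the URL may change over time
GLOVE_URL = "http://nlp.stanford.edu/data/glove.6B.zip"

def download_glove(target_dir="data"):
    # download the zip once, then extract glove.6B.{50,100,200,300}d.txt into data/
    os.makedirs(target_dir, exist_ok=True)
    zip_path = os.path.join(target_dir, "glove.6B.zip")
    if not os.path.isfile(zip_path):
        urllib.request.urlretrieve(GLOVE_URL, zip_path)
    with zipfile.ZipFile(zip_path) as z:
        z.extractall(target_dir)

if __name__ == "__main__":
    download_glove()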

sentiment_analysis.py

from tensorflow.keras.callbacks import TensorBoard
import os

from parameters import *
from utils import create_model, load_imdb_data

# create these folders if they do not exist
if not os.path.isdir("results"):
    os.mkdir("results")
if not os.path.isdir("logs"):
    os.mkdir("logs")
if not os.path.isdir("data"):
    os.mkdir("data")

# dataset name, IMDB movie reviews dataset
dataset_name = "imdb"
# get the unique model name based on the hyperparameters in parameters.py
model_name = get_model_name(dataset_name)
# load the data
data = load_imdb_data(N_WORDS, SEQUENCE_LENGTH, TEST_SIZE, oov_token=OOV_TOKEN)
# construct the model
model = create_model(data["tokenizer"].word_index, units=UNITS, n_layers=N_LAYERS,
                     cell=RNN_CELL, bidirectional=IS_BIDIRECTIONAL, embedding_size=EMBEDDING_SIZE,
                     sequence_length=SEQUENCE_LENGTH, dropout=DROPOUT,
                     loss=LOSS, optimizer=OPTIMIZER, output_length=data["y_train"][0].shape[0])
model.summary()
# log training metrics to TensorBoard
tensorboard = TensorBoard(log_dir=os.path.join("logs", model_name))
# train the model
history = model.fit(data["X_train"], data["y_train"],
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_data=(data["X_test"], data["y_test"]),
                    callbacks=[tensorboard],
                    verbose=1)
# save the trained model
model.save(os.path.join("results", model_name) + ".h5")
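The training curves are written to logs/<model_name> and can be inspected with tensorboard --logdir logs; alternatively, the history object returned by model.fit() can be plotted directly. An optional sketch, assuming matplotlib is installed:

import matplotlib.pyplot as plt

# history is the object returned by model.fit() above
plt.plot(history.history["accuracy"], label="training accuracy")
plt.plot(history.history["val_accuracy"], label="validation accuracy")
plt.xlabel("epoch")
plt.ylabel("accuracy")
plt.legend()
plt.savefig(f"{model_name}-accuracy.png")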

20_news_group_classification.py

from tensorflow.keras.callbacks import TensorBoard
import os

from parameters import *
from utils import create_model, load_20_newsgroup_data

# create these folders if they do not exist
if not os.path.isdir("results"):
    os.mkdir("results")
if not os.path.isdir("logs"):
    os.mkdir("logs")
if not os.path.isdir("data"):
    os.mkdir("data")

# dataset name, the 20 newsgroups dataset
dataset_name = "20_news_group"
# get the unique model name based on the hyperparameters in parameters.py
model_name = get_model_name(dataset_name)
# load the data
data = load_20_newsgroup_data(N_WORDS, SEQUENCE_LENGTH, TEST_SIZE, oov_token=OOV_TOKEN)
# construct the model
model = create_model(data["tokenizer"].word_index, units=UNITS, n_layers=N_LAYERS,
                     cell=RNN_CELL, bidirectional=IS_BIDIRECTIONAL, embedding_size=EMBEDDING_SIZE,
                     sequence_length=SEQUENCE_LENGTH, dropout=DROPOUT,
                     loss=LOSS, optimizer=OPTIMIZER, output_length=data["y_train"][0].shape[0])
model.summary()
# log training metrics to TensorBoard
tensorboard = TensorBoard(log_dir=os.path.join("logs", model_name))
# train the model
history = model.fit(data["X_train"], data["y_train"],
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_data=(data["X_test"], data["y_test"]),
                    callbacks=[tensorboard],
                    verbose=1)
# save the trained model
model.save(os.path.join("results", model_name) + ".h5")
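After training, the held-out split can also be scored explicitly with model.evaluate; a short optional addition to the end of either training script:

# evaluate on the held-out test split (loss and accuracy, as compiled above)
loss, accuracy = model.evaluate(data["X_test"], data["y_test"], verbose=0)
print(f"Test loss: {loss:.4f}")
print(f"Test accuracy: {accuracy:.4f}")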

test.py

from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import os

from parameters import *
from utils import create_model, load_20_newsgroup_data, load_imdb_data

# dataset name, IMDB movie reviews dataset
dataset_name = "imdb"
# get the unique model name based on the hyperparameters in parameters.py
model_name = get_model_name(dataset_name)
# load the data (swap in load_20_newsgroup_data to test the 20 newsgroups model)
# data = load_20_newsgroup_data(N_WORDS, SEQUENCE_LENGTH, TEST_SIZE, oov_token=OOV_TOKEN)
data = load_imdb_data(N_WORDS, SEQUENCE_LENGTH, TEST_SIZE, oov_token=OOV_TOKEN)
# rebuild the same architecture and load the trained weights
model = create_model(data["tokenizer"].word_index, units=UNITS, n_layers=N_LAYERS,
                     cell=RNN_CELL, bidirectional=IS_BIDIRECTIONAL, embedding_size=EMBEDDING_SIZE,
                     sequence_length=SEQUENCE_LENGTH, dropout=DROPOUT,
                     loss=LOSS, optimizer=OPTIMIZER, output_length=data["y_train"][0].shape[0])
model.load_weights(os.path.join("results", f"{model_name}.h5"))


def get_predictions(text):
    sequence = data["tokenizer"].texts_to_sequences([text])
    # pad the sequence
    sequence = pad_sequences(sequence, maxlen=SEQUENCE_LENGTH)
    # get the prediction
    prediction = model.predict(sequence)[0]
    print("output vector:", prediction)
    return data["int2label"][np.argmax(prediction)]


while True:
    text = input("Enter your text: ")
    prediction = get_predictions(text)
    print("="*50)
    print("The class is:", prediction)
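The same tokenize-and-pad pipeline works on many texts at once; a hypothetical get_batch_predictions variant that could replace the interactive loop at the bottom of test.py:

def get_batch_predictions(texts):
    # tokenize and pad all texts in a single call
    sequences = data["tokenizer"].texts_to_sequences(texts)
    sequences = pad_sequences(sequences, maxlen=SEQUENCE_LENGTH)
    # predict class probabilities for the whole batch
    predictions = model.predict(sequences)
    return [data["int2label"][np.argmax(p)] for p in predictions]

print(get_batch_predictions([
    "This movie was a masterpiece, I loved every minute of it.",
    "Total waste of time, the plot made no sense.",
]))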
