parameters.py
from tensorflow.keras.layers import LSTM

# max number of words in each sentence
SEQUENCE_LENGTH = 300
# N-dimensional GloVe embedding vectors
EMBEDDING_SIZE = 300
# number of words to use, discarding the rest
N_WORDS = 10000
# out of vocabulary token
OOV_TOKEN = None
# 30% testing set, 70% training set
TEST_SIZE = 0.3
# number of recurrent cell layers
N_LAYERS = 1
# the RNN cell to use, LSTM in this case
RNN_CELL = LSTM
# whether it's a bidirectional RNN
IS_BIDIRECTIONAL = False
# number of units (RNN_CELL nodes) in each layer
UNITS = 128
# dropout rate
DROPOUT = 0.4

### Training parameters
LOSS = "categorical_crossentropy"
OPTIMIZER = "adam"
BATCH_SIZE = 64
EPOCHS = 6


def get_model_name(dataset_name):
    # construct the unique model name from the hyperparameters
    model_name = f"{dataset_name}-{RNN_CELL.__name__}-seq-{SEQUENCE_LENGTH}-em-{EMBEDDING_SIZE}-w-{N_WORDS}-layers-{N_LAYERS}-units-{UNITS}-opt-{OPTIMIZER}-BS-{BATCH_SIZE}-d-{DROPOUT}"
    if IS_BIDIRECTIONAL:
        # add 'bid' prefix if bidirectional
        model_name = "bid-" + model_name
    if OOV_TOKEN:
        # add 'oov' suffix if an OOV token is specified
        model_name += "-oov"
    return model_name
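For reference, with the default hyperparameters above, get_model_name("imdb") produces the following identifier (no "bid-" prefix or "-oov" suffix, since IS_BIDIRECTIONAL is False and OOV_TOKEN is None); the same string is later used for the TensorBoard log directory and the saved model file:

imdb-LSTM-seq-300-em-300-w-10000-layers-1-units-128-opt-adam-BS-64-d-0.4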
utils.py
from tqdm import tqdm
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups


def get_embedding_vectors(word_index, embedding_size=100):
    # build the embedding matrix from the pre-trained GloVe vectors;
    # words not found in GloVe keep an all-zero vector
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_size))
    with open(f"data/glove.6B.{embedding_size}d.txt", encoding="utf8") as f:
        for line in tqdm(f, "Reading GloVe"):
            values = line.split()
            # the word is the first token in the line
            word = values[0]
            if word in word_index:
                idx = word_index[word]
                # the vector is the remaining values in the line
                embedding_matrix[idx] = np.array(values[1:], dtype="float32")
    return embedding_matrix


def create_model(word_index, units=128, n_layers=1, cell=LSTM, bidirectional=False,
                 embedding_size=100, sequence_length=100, dropout=0.3,
                 loss="categorical_crossentropy", optimizer="adam", output_length=2):
    """Constructs an RNN model given its parameters."""
    embedding_matrix = get_embedding_vectors(word_index, embedding_size)
    model = Sequential()
    # add the embedding layer, initialized with the GloVe vectors and frozen
    model.add(Embedding(len(word_index) + 1,
                        embedding_size,
                        weights=[embedding_matrix],
                        trainable=False,
                        input_length=sequence_length))
    for i in range(n_layers):
        if i == n_layers - 1:
            # last layer: return only the final output
            if bidirectional:
                model.add(Bidirectional(cell(units, return_sequences=False)))
            else:
                model.add(cell(units, return_sequences=False))
        else:
            # first or hidden layers: return the full sequence
            if bidirectional:
                model.add(Bidirectional(cell(units, return_sequences=True)))
            else:
                model.add(cell(units, return_sequences=True))
        model.add(Dropout(dropout))
    model.add(Dense(output_length, activation="softmax"))
    # compile the model
    model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])
    return model


def load_imdb_data(num_words, sequence_length, test_size=0.25, oov_token=None):
    # read reviews
    reviews = []
    with open("data/reviews.txt") as f:
        for review in f:
            reviews.append(review.strip())

    labels = []
    with open("data/labels.txt") as f:
        for label in f:
            labels.append(label.strip())

    # tokenize the dataset corpus, discarding uncommon words such as names, etc.
    tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
    tokenizer.fit_on_texts(reviews)
    X = tokenizer.texts_to_sequences(reviews)
    # pad sequences with 0's up to `sequence_length` (returns a numpy array)
    X = pad_sequences(X, maxlen=sequence_length)
    # map the text labels ("negative"/"positive") to integers, then one-hot encode them
    label2int = {"negative": 0, "positive": 1}
    int2label = {0: "negative", 1: "positive"}
    y = to_categorical([label2int[label] for label in labels])
    # split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=1)

    data = {}
    data["X_train"] = X_train
    data["X_test"] = X_test
    data["y_train"] = y_train
    data["y_test"] = y_test
    data["tokenizer"] = tokenizer
    data["int2label"] = int2label
    data["label2int"] = label2int
    return data


def load_20_newsgroup_data(num_words, sequence_length, test_size=0.25, oov_token=None):
    # load the 20 newsgroups dataset, shuffling the data and removing each
    # document's header, signature block and quotation block
    dataset = fetch_20newsgroups(subset="all", shuffle=True, remove=("headers", "footers", "quotes"))
    documents = dataset.data
    labels = dataset.target

    tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
    tokenizer.fit_on_texts(documents)
    X = tokenizer.texts_to_sequences(documents)
    # pad sequences with 0's up to `sequence_length` (returns a numpy array)
    X = pad_sequences(X, maxlen=sequence_length)
    # convert the integer labels to one-hot encoded vectors
    y = to_categorical(labels)
    # split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=1)

    data = {}
    data["X_train"] = X_train
    data["X_test"] = X_test
    data["y_train"] = y_train
    data["y_test"] = y_test
    data["tokenizer"] = tokenizer
    data["int2label"] = {i: label for i, label in enumerate(dataset.target_names)}
    data["label2int"] = {label: i for i, label in enumerate(dataset.target_names)}
    return data
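get_embedding_vectors() expects the pre-trained GloVe 6B vectors under data/. Below is a minimal download sketch, not part of utils.py; the URL is an assumption about where the GloVe 6B archive is commonly hosted, so adjust it (or download and unzip the archive manually) if it has moved.

# helper sketch: fetch and unpack the GloVe 6B vectors into data/ so that
# data/glove.6B.100d.txt, data/glove.6B.300d.txt, ... exist.
# The URL below is an assumption about the archive's download location.
import os
import zipfile
import urllib.request

GLOVE_URL = "http://nlp.stanford.edu/data/glove.6B.zip"  # assumed location
os.makedirs("data", exist_ok=True)
archive_path = os.path.join("data", "glove.6B.zip")
if not os.path.isfile(archive_path):
    urllib.request.urlretrieve(GLOVE_URL, archive_path)
with zipfile.ZipFile(archive_path) as archive:
    archive.extractall("data")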
sentiment_analysis.py
from tensorflow.keras.callbacks import TensorBoard
import os
from parameters import *
from utils import create_model, load_imdb_data

# create these folders if they do not exist
if not os.path.isdir("results"):
    os.mkdir("results")
if not os.path.isdir("logs"):
    os.mkdir("logs")
if not os.path.isdir("data"):
    os.mkdir("data")

# dataset name, IMDB movie reviews dataset
dataset_name = "imdb"
# get the unique model name based on the hyperparameters in parameters.py
model_name = get_model_name(dataset_name)
# load the data
data = load_imdb_data(N_WORDS, SEQUENCE_LENGTH, TEST_SIZE, oov_token=OOV_TOKEN)
# construct the model
model = create_model(data["tokenizer"].word_index, units=UNITS, n_layers=N_LAYERS,
                     cell=RNN_CELL, bidirectional=IS_BIDIRECTIONAL, embedding_size=EMBEDDING_SIZE,
                     sequence_length=SEQUENCE_LENGTH, dropout=DROPOUT,
                     loss=LOSS, optimizer=OPTIMIZER, output_length=data["y_train"][0].shape[0])
model.summary()
# use TensorBoard to keep track of the training metrics
tensorboard = TensorBoard(log_dir=os.path.join("logs", model_name))
# train the model
history = model.fit(data["X_train"], data["y_train"],
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_data=(data["X_test"], data["y_test"]),
                    callbacks=[tensorboard],
                    verbose=1)
# save the trained model
model.save(os.path.join("results", model_name) + ".h5")
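Because the TensorBoard callback writes its event files to logs/<model name>, the training and validation curves can be inspected while the script runs by launching TensorBoard with tensorboard --logdir logs and opening the local URL it prints in a browser. The same applies to the 20 newsgroups script below, which differs only in the data loader and dataset name.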
20_news_group_classification.py
from tensorflow.keras.callbacks import TensorBoard
import os
from parameters import *
from utils import create_model, load_20_newsgroup_data

# create these folders if they do not exist
if not os.path.isdir("results"):
    os.mkdir("results")
if not os.path.isdir("logs"):
    os.mkdir("logs")
if not os.path.isdir("data"):
    os.mkdir("data")

# dataset name, the 20 newsgroups dataset
dataset_name = "20_news_group"
# get the unique model name based on the hyperparameters in parameters.py
model_name = get_model_name(dataset_name)
# load the data
data = load_20_newsgroup_data(N_WORDS, SEQUENCE_LENGTH, TEST_SIZE, oov_token=OOV_TOKEN)
# construct the model
model = create_model(data["tokenizer"].word_index, units=UNITS, n_layers=N_LAYERS,
                     cell=RNN_CELL, bidirectional=IS_BIDIRECTIONAL, embedding_size=EMBEDDING_SIZE,
                     sequence_length=SEQUENCE_LENGTH, dropout=DROPOUT,
                     loss=LOSS, optimizer=OPTIMIZER, output_length=data["y_train"][0].shape[0])
model.summary()
# use TensorBoard to keep track of the training metrics
tensorboard = TensorBoard(log_dir=os.path.join("logs", model_name))
# train the model
history = model.fit(data["X_train"], data["y_train"],
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_data=(data["X_test"], data["y_test"]),
                    callbacks=[tensorboard],
                    verbose=1)
# save the trained model
model.save(os.path.join("results", model_name) + ".h5")
test.py
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from parameters import *
from utils import create_model, load_20_newsgroup_data, load_imdb_data
import os

# dataset name, IMDB movie reviews dataset
dataset_name = "imdb"
# get the unique model name based on the hyperparameters in parameters.py
model_name = get_model_name(dataset_name)

# load the data (switch to load_20_newsgroup_data for the 20 newsgroups model)
# data = load_20_newsgroup_data(N_WORDS, SEQUENCE_LENGTH, TEST_SIZE, oov_token=OOV_TOKEN)
data = load_imdb_data(N_WORDS, SEQUENCE_LENGTH, TEST_SIZE, oov_token=OOV_TOKEN)

# rebuild the same architecture and load the trained weights
model = create_model(data["tokenizer"].word_index, units=UNITS, n_layers=N_LAYERS,
                     cell=RNN_CELL, bidirectional=IS_BIDIRECTIONAL, embedding_size=EMBEDDING_SIZE,
                     sequence_length=SEQUENCE_LENGTH, dropout=DROPOUT,
                     loss=LOSS, optimizer=OPTIMIZER, output_length=data["y_train"][0].shape[0])
model.load_weights(os.path.join("results", f"{model_name}.h5"))


def get_predictions(text):
    sequence = data["tokenizer"].texts_to_sequences([text])
    # pad the sequence
    sequence = pad_sequences(sequence, maxlen=SEQUENCE_LENGTH)
    # get the prediction
    prediction = model.predict(sequence)[0]
    print("output vector:", prediction)
    return data["int2label"][np.argmax(prediction)]


while True:
    text = input("Enter your text: ")
    prediction = get_predictions(text)
    print("=" * 50)
    print("The class is:", prediction)
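Note that test.py reconstructs the architecture from the values in parameters.py and only loads the trained weights, so those hyperparameters must match the ones used when the model in results/ was trained (they also determine the weight file's name). The interactive loop keeps asking for input until the process is interrupted, e.g. with Ctrl+C.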