import re
import string

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.base import TransformerMixin
from sklearn.metrics import ConfusionMatrixDisplay
from keras.preprocessing.text import Tokenizer

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext.vocab import build_vocab_from_iterator, GloVe
from torchtext.data.utils import get_tokenizer


def download_if_non_existent(res_path, res_name):
    """Download an NLTK resource only if it is not already available locally."""
    try:
        nltk.data.find(res_path)
    except LookupError:
        print(f'resource {res_path} not found. Downloading now...')
        nltk.download(res_name)


download_if_non_existent('corpora/stopwords', 'stopwords')
download_if_non_existent('tokenizers/punkt', 'punkt')
download_if_non_existent('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger')
download_if_non_existent('corpora/wordnet', 'wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def fit_model(pipeline, x_train, y_train, x_test, y_test):
    """Fit an sklearn pipeline and return its normalized confusion matrix on the test set."""
    pipeline.fit(x_train, y_train)
    return ConfusionMatrixDisplay.from_estimator(
        pipeline, x_test, y_test, normalize="true"
    )
class LinguisticPreprocessor(TransformerMixin):
    """Text-cleaning transformer: strips HTML tags, punctuation and repeated spaces,
    lemmatizes tokens, and removes English stopwords."""

    def __init__(self):
        super().__init__()
        self.lemmatizer = WordNetLemmatizer()
        self.tokenizer = Tokenizer()  # Keras tokenizer; not used by the cleaning steps below
        self.stop_words = set(stopwords.words('english'))

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn from the data.
        return self

    def transform(self, X, y=None):
        X = self._remove_html_tags(X)
        X = self._remove_all_punctuations(X)
        X = self._remove_double_spaces(X)
        X = self._lemmatize(X)
        X = self._remove_stopwords(X)
        return X

    def _remove_html_tags(self, X):
        return list(map(lambda x: BeautifulSoup(x, 'html.parser').get_text(), X))

    def _remove_all_punctuations(self, X):
        return list(
            map(
                lambda text: re.sub('[%s]' % re.escape(string.punctuation), '', text),
                X
            )
        )

    def _remove_double_spaces(self, X):
        return list(map(lambda text: re.sub(" +", " ", text), X))

    def _remove_stopwords(self, X):
        return list(map(
            lambda text: " ".join(
                word for word in text.split() if word not in self.stop_words
            ),
            X
        ))

    def _lemmatize(self, X):
        return list(map(self._lemmatize_one_sentence, X))

    def _lemmatize_one_sentence(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        tokens = [self.lemmatizer.lemmatize(word) for word in tokens]
        return " ".join(tokens)
def training_data(dataset_1, dataset_2, dataset_3):
    """Build train/test splits from three Hugging Face-style datasets.

    Training texts from all three datasets are concatenated, exact duplicates are
    dropped, and any training example whose text also appears in dataset_1's test
    split is removed to avoid leakage.
    """
    X_test = dataset_1['test']['text']
    y_test = dataset_1['test']['label']
    test_df = pd.DataFrame({
        'text': X_test,
        'label': y_test
    })
    combined_train_df = pd.DataFrame({
        'text': dataset_1['train']['text'] + dataset_2['train']['text'] + dataset_3['train']['text'],
        'label': dataset_1['train']['label'] + dataset_2['train']['label'] + dataset_3['train']['label']
    })
    combined_train_df.drop_duplicates(subset=['text'], inplace=True)

    # Left merge against the test set; rows marked 'left_only' occur only in the training data.
    merged_df = pd.merge(combined_train_df, test_df, on="text", how='left', indicator=True)
    result_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])

    X_train = result_df['text'].tolist()
    y_train = result_df['label_x'].tolist()  # 'label_x' is the training label after the merge
    X_test = np.array(X_test)
    X_train = np.array(X_train)
    return X_train, y_train, X_test, y_test
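
# Sketch (assumption): training_data expects objects indexable as ds[split][column],
# e.g. the result of datasets.load_dataset(...). The dataset names below are
# placeholders with 'text'/'label' columns, not necessarily the ones this Space used.
def _example_load_training_data():
    from datasets import load_dataset

    imdb = load_dataset("imdb")                 # placeholder dataset 1 (provides the test split)
    rotten = load_dataset("rotten_tomatoes")    # placeholder dataset 2
    yelp = load_dataset("yelp_polarity")        # placeholder dataset 3
    return training_data(imdb, rotten, yelp)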
class CNN(nn.Module):
    """Text CNN over word embeddings: parallel convolutions with different window
    sizes, max-pooled over time, concatenated, then passed to a linear classifier."""

    def __init__(self, vocab_size, embed_size, n_filters, filter_sizes, dropout, num_classes):
        super(CNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(fs, embed_size))
            for fs in filter_sizes
        ])
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(len(filter_sizes) * n_filters, num_classes)

    def forward(self, text):
        # text: [batch, seq_len] of token indices
        embedded = self.embedding(text)    # [batch, seq_len, embed_size]
        embedded = embedded.unsqueeze(1)   # [batch, 1, seq_len, embed_size]
        # Each conv yields [batch, n_filters, seq_len - fs + 1] after squeezing the width dim.
        conved = [F.leaky_relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # Max-pool over the time dimension -> [batch, n_filters] per filter size.
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))  # [batch, n_filters * len(filter_sizes)]
        return self.fc1(cat)                          # [batch, num_classes]
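
# Sketch (assumption): the hyperparameters below are illustrative, not the values used
# by this Space. It only demonstrates the input/output shapes the CNN expects.
def _example_cnn_forward():
    model = CNN(
        vocab_size=20000,        # size of the torchtext vocabulary (placeholder)
        embed_size=100,
        n_filters=100,
        filter_sizes=[3, 4, 5],  # convolution window widths in tokens
        dropout=0.5,
        num_classes=2,
    )
    batch = torch.randint(0, 20000, (8, 50))  # 8 padded sequences of 50 token ids
    logits = model(batch)                     # -> shape [8, 2]
    return logits.shape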
def build_vocab(data_iter):
    """Build a torchtext vocabulary from an iterable of {'text': ...} examples."""
    tokenizer = get_tokenizer("basic_english")

    def yield_tokens():
        for example in data_iter:
            cleaned_text = clean_text(example['text'])
            yield tokenizer(cleaned_text)

    vocab = build_vocab_from_iterator(yield_tokens(), specials=["<unk>", "<pad>"])
    vocab.set_default_index(vocab["<unk>"])  # unseen tokens map to <unk>
    return vocab, tokenizer


def clean_text(text):
    """Lowercase, strip digits and punctuation, lemmatize, and drop stopwords."""
    text = text.lower()
    text = re.sub(r'\d+', '', text)      # remove digits
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = ' '.join(
        lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words
    )
    return text
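
# Sketch (assumption): how the vocabulary and tokenizer returned by build_vocab might be
# used to turn raw strings into padded index tensors for CNN.forward. The max_len value
# and right-padding with <pad> are illustrative choices, not taken from this file.
def _example_numericalize(texts, vocab, tokenizer, max_len=200):
    pad_idx = vocab["<pad>"]
    batch = []
    for text in texts:
        ids = vocab(tokenizer(clean_text(text)))  # tokens -> vocabulary indices
        ids = ids[:max_len] + [pad_idx] * max(0, max_len - len(ids))
        batch.append(ids)
    return torch.tensor(batch, dtype=torch.long)  # [batch, max_len], ready for the CNN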