Spaces:
Running
Running
| from functions_preprocess import LinguisticPreprocessor, fit_model, training_data | |
| from datasets import load_dataset | |
| import pandas as pd | |
| from sklearn.linear_model import SGDClassifier | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| def main(): | |
| #####load dataset | |
| dataset_1 = load_dataset("rotten_tomatoes", download_mode="force_redownload") | |
| dataset_2 = load_dataset('sst2') | |
| dataset_2 = dataset_2.rename_column('sentence', 'text') | |
| dataset_3 = load_dataset('imdb') | |
| X_train, y_train, X_test, y_test = training_data(dataset_1, dataset_2, dataset_3) | |
| pipeline = Pipeline( | |
| steps=[ | |
| ("processor", LinguisticPreprocessor()), | |
| ("vectorizer", TfidfVectorizer(ngram_range=(1, 2))), | |
| ("model", SGDClassifier(loss="log_loss", n_jobs = -1, alpha=0.000001, penalty= 'elasticnet'))]) | |
| ####### fit model and save the results | |
| fit_model(pipeline, X_train, y_train, X_test, y_test) | |
| predictions = pipeline.predict(X_test) | |
| # Create a DataFrame with index and predictions | |
| results_df = pd.DataFrame({ | |
| "index": range(len(predictions)), | |
| "pred": predictions}) | |
| # Save the DataFrame to a CSV file | |
| results_df.to_csv("results.csv", index=False) | |
| if __name__ == "__main__": | |
| main() | |
| # model_pkl_file = "sentiment_model.pkl" | |
| # | |
| # with open(model_pkl_file, 'wb') as file: | |
| # pickle.dump(pipeline, file) | |