17 Commits

23 changed files with 8033 additions and 202 deletions

1
.vscode/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
/settings.json

123
ClassificationFake.py Normal file
View File

@@ -0,0 +1,123 @@
import numpy as np
import pandas as pd
from datetime import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset
#%%
# Path configuration and data loading.
# Tip: install xformers (pip install xformers) for better pipeline performance.
###################
# Working directory (Michael's machine); swap in the server path when deploying.
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# Input / output subdirectories, relative to wd.
di = "data/IN/"
ud = "data/OUT/"
# Input: topic-classified tweets produced by ClassificationTopic.py.
senCSV = "Tweets-Classified-Topic-Results.csv"
# Outputs of this script: prepared input and final fake-news classification.
senCSVClassifiedPrep = "Tweets-Classified-Fake-Prep.csv"
senCSVClassifiedResult = "Tweets-Classified-Fake-Results.csv"
# Derived absolute paths — don't change these.
senCSVPath = f"{wd}{ud}{senCSV}"
senCSVcClassificationPrepPath = f"{wd}{ud}{senCSVClassifiedPrep}"
senCSVcClassificationResultPath = f"{wd}{ud}{senCSVClassifiedResult}"
# Make the project-local helper modules in <wd>/funs importable.
import sys
funs = f"{wd}funs"
sys.path.insert(1, funs)
import CleanTweets
#%%
# Load the topic-classified tweets; dtype=object avoids pandas dtype guessing
# (ids stay strings).
dfClassify = pd.read_csv(senCSVPath, dtype=(object))
def encode_labels(label):
    """Invert a topic label string: 'True' <-> 'False'.

    Corrects the inverted labels written by ClassificationTopic.py (see its
    trailing "still wrong, will be corrected in ClassificationFake.py" note).
    Any value other than 'True'/'False' maps to 0, preserving the original
    fallback behavior.
    """
    return {'True': 'False', 'False': 'True'}.get(label, 0)
# Invert the topic labels and write the corrected topic results back over the
# input file. CONSISTENCY FIX: the original hard-coded the literal path
# "/home/michael/.../Tweets-Classified-Topic-Results.csv", which is byte-identical
# to senCSVPath — use the variable so a wd change cannot silently diverge.
dfClassify['output_label_topicCov'] = dfClassify['output_label_topicCov'].apply(encode_labels)
dfClassify.to_csv(senCSVPath, encoding='utf-8')
# Keep only tweets classified as covid-related, then default the fake flag.
dfClassify = dfClassify[dfClassify['output_label_topicCov'] == 'True']
dfClassify['fake'] = False
#%%
# Fake-news classification model, fine-tuned from
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
# HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
modelPath = "/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/"
pipe = pipeline("text-classification", model=modelPath)
# The pipeline loads its own copy of the model, so the original's separate
# `model = AutoModelForSequenceClassification.from_pretrained(...)` was an
# expensive load that was never used anywhere in this script — removed.
# The tokenizer IS used below to build the input_ids column.
tokenizer = AutoTokenizer.from_pretrained(modelPath)
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
dfClassify['cleanContent'] = dfClassify['rawContent'].apply(CleanTweets.preprocess_text)
#%%
# Drop tweets whose cleaned text came out empty — the classifier cannot score them.
# NOTE(review): inplace ops on a frame produced by boolean filtering may trigger
# pandas' SettingWithCopyWarning; behavior kept as-is, verify on upgrade.
dfClassify.cleanContent.replace('',np.nan,inplace=True)
dfClassify.dropna(subset=['cleanContent'], inplace=True)
#%%
timeStart = datetime.now() # start counting execution time
# Tokenize to fixed-length (128) BERT input ids. The resulting input_ids column
# is not written to the prep CSV below and is not read again in this script.
max_length = 128
dfClassify['input_ids'] = dfClassify['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
#train.rename(columns={'target': 'labels'}, inplace=True)
#train.head()
# %%
# Persist id + cleaned text; this file feeds load_dataset() in the next cell.
dfClassify.to_csv(senCSVcClassificationPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
#%%
# Reload the prepared CSV as a HF dataset so the pipeline can stream it in batches.
dataset = load_dataset("csv", data_files=senCSVcClassificationPrepPath)
#%%
# Classify every cleaned tweet. Results come back in input order, so the two
# lists line up positionally with the rows of dfClassify (assumes dfClassify's
# row order still matches the prep CSV — TODO confirm).
output_labels = []
output_score = []
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
    # out is e.g. {'label': 'POSITIVE', 'score': 0.9998743534088135}
    output_labels.append(out['label'])
    output_score.append(out['score'])
# %%
dfClassify['output_label_fake'] = output_labels
dfClassify['output_score_fake'] = output_score
timeEnd = datetime.now()
timeTotal = timeEnd - timeStart
# BUGFIX: the original divided by a hard-coded 96 (the pretest sample size),
# not the number of tweets actually classified. Guard against an empty run.
tweetCount = len(output_labels)
timePerTweet = timeTotal / tweetCount if tweetCount else timeTotal
# timeTotal is a timedelta and prints as H:MM:SS — the original's " seconds"
# suffix was misleading, so it is dropped.
print(f"Total classification execution time: {timeTotal}")
print(f"Time per tweet classification: {timePerTweet}")
# %%
# Final output: topic-filtered tweets with fake/non-fake labels and scores.
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
# %%

View File

@@ -1,12 +1,9 @@
import re
import string
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from datetime import datetime from datetime import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset from transformers.pipelines.pt_utils import KeyDataset
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
#%% #%%
@@ -29,14 +26,20 @@ ud = "data/OUT/"
senCSV = "SenatorsTweets-OnlyCov.csv" senCSV = "SenatorsTweets-OnlyCov.csv"
# Name of Classify datafile # Name of Classify datafile
senCSVClassifiedPrep = "Tweets-Classified-Prep.csv" senCSVClassifiedPrep = "Tweets-Classified-Topic-Prep.csv"
senCSVClassifiedResult = "Tweets-Classified-Results.csv" senCSVClassifiedResult = "Tweets-Classified-Topic-Results.csv"
# don't change this one # don't change this one
senCSVPath = wd + ud + senCSV senCSVPath = wd + ud + senCSV
senCSVcClassificationPrepPath = wd + ud + senCSVClassifiedPrep senCSVcClassificationPrepPath = wd + ud + senCSVClassifiedPrep
senCSVcClassificationResultPath = wd + ud + senCSVClassifiedResult senCSVcClassificationResultPath = wd + ud + senCSVClassifiedResult
import sys
funs = wd+"funs"
sys.path.insert(1, funs)
import CleanTweets
#%% #%%
# get datafra,e # get datafra,e
dfClassify = pd.read_csv(senCSVPath, dtype=(object)) dfClassify = pd.read_csv(senCSVPath, dtype=(object))
@@ -50,17 +53,13 @@ dfClassify['fake'] = False
# HowTo: # HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification # https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline # https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth") pipe = pipeline("text-classification", model="/home/michael/Documents/PS/Data/collectTweets/models/CovClass/2023-08-15_05-56-50/")
model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth") model = AutoModelForSequenceClassification.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/CovClass/2023-08-15_05-56-50/")
tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth") tokenizer = AutoTokenizer.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/CovClass/2023-08-15_05-56-50/")
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert # Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
dfClassify['cleanContent'] = dfClassify['rawContent'].apply(remove_URL) dfClassify['cleanContent'] = dfClassify['rawContent'].apply(CleanTweets.preprocess_text)
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(remove_emoji)
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(remove_html)
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(remove_punct)
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(lambda x: x.lower())
#%% #%%
# remove empty rows # remove empty rows
@@ -97,8 +96,8 @@ for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, trun
# Exactly the same output as before, but the content are passed # Exactly the same output as before, but the content are passed
# as batches to the model # as batches to the model
# %% # %%
dfClassify['output_label'] = output_labels dfClassify['output_label_topicCov'] = output_labels
dfClassify['output_score'] = output_score dfClassify['output_score_topicCov'] = output_score
timeEnd = datetime.now() timeEnd = datetime.now()
timeTotal = timeEnd - timeStart timeTotal = timeEnd - timeStart
@@ -111,3 +110,14 @@ print(f"Time per tweet classification: {timePerTweet}")
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8') dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
# %% # %%
## corrections
def encode_labels(label):
if label == 'real':
return 'True'
elif label == 'fake':
return 'False'
return 0
dfClassify['output_label_topicCov'] = dfClassify['output_label_topicCov'].apply(encode_labels)
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
#still wrong, will be corrected in ClassificationFake.py

128
README.md
View File

@@ -1,7 +1,127 @@
# How to use # Requirements
Execute collect.py to scrape tweets and generate the ´ALL-SENATORS-TWEETS.csv´. - python 3.10+
- snscrape 0.6.2.20230321+ (see git repo in this folder)
- transformers 4.31.0
- numpy 1.23.5
- pandas 2.0.3
- scikit-learn 1.3.0
- torch 2.0.1
Execute collectSenData.py to scrape senator data and generate ´ALL-SENATORS.csv´. # About
All new files will be written to ´data/OUT/´. Necessary data has to be located in ´data/IN/´ This collection of scripts scrapes tweets of US-senators in the time from 2020-01-01T00:00:00Z to 2023-01-03T00:00:00Z, scrapes account data of the senators, prepares the tweets for the training of a NLP-model, trains two models to (1) classify the tweets topic as covid or non-covid and (2) the tweets as either "fake news" tweets or "non-fake news" tweets.
Training only works with a prepared dataset in which the tweets are pre-classified.
More info in the comments of the scripts.
Due to time constraints, most of the code is procedurally coded and ugly but effective.
# How to
Tested on Ubuntu 22.04.
If needed, the virtual environment can be exported and sent to you.
All files in the folder data/in have to exist in order to execute the scripts.
Execute in the following order:
01 collect.py (see more for further info on scraping)
02 collectSenData.py
03 cleanTweets
04 preTestClassification.py
05 trainTopic.py
06 trainFake.py
07 ClassificationFake.py
08 ClassificationTopic.py
# Files & Folders
├── data
│   ├── IN
│   │   ├── counterKeywordsFinal.txt
│   │   ├── counterKeywords.txt
│   │   ├── keywords-raw.txt
│   │   ├── keywords.txt
│   │   ├── own_keywords.txt
│   │   ├── pretest-tweets_fake.txt contains tweet ids for pretest
│   │   ├── pretest-tweets_not_fake.txt contains tweet ids for pretest
│   │   └── senators-raw.csv senator datafile
│   ├── OUT
│   │   ├── ALL-SENATORS-TWEETS.csv
│   │   ├── graphs
│   │   │   ├── Timeline.png
│   │   │   ├── Wordcloud-All.png
│   │   │   └── Wordcloud-Cov.png
│   │   ├── Pretest-Prep.csv
│   │   ├── Pretest-Results.csv
│   │   ├── Pretest-SENATORS-TWEETS.csv
│   │   ├── profiles dataset profiles
│   │   │   ├── AllTweets.html
│   │   │   └── CovTweets.html
│   │   ├── SenatorsTweets-Final.csv
│   │   ├── SenatorsTweets-OnlyCov.csv
│   │   ├── SenatorsTweets-train-CovClassification.csv
│   │   ├── SenatorsTweets-train-CovClassificationTRAIN.csv
│   │   ├── SenatorsTweets-train-CovClassification.tsv
│   │   ├── SenatorsTweets-train-FakeClassification.csv
│   │   ├── SenatorsTweets-train-FakeClassificationTRAIN.csv
│   │   ├── SenatorsTweets-train-FakeClassification.tsv
│   │   ├── SenatorsTweets-Training.csv
│   │   ├── SenatorsTweets-Training_WORKING-COPY.csv
│   │   ├── topClass-PRETEST-Prep.csv
│   │   ├── topClass-PRETEST-Results.csv
│   │   ├── Tweets-All-slices.zip
│   │   ├── Tweets-Classified-Fake-Prep.csv
│   │   ├── Tweets-Classified-Fake-Results.csv
│   │   ├── Tweets-Classified-Prep.csv
│   │   ├── Tweets-Classified-Topic-Prep.csv
│   │   ├── Tweets-Classified-Topic-Results.csv
│   │   └── Tweets-Stub.csv
├── funs
│   ├── CleanTweets.py 2023-01-03T00:00:00Z multiple functions to clean tweet contents for NLN-processing
│   ├── ClearDupes.py function for deletion of duplicate keywords
│   ├── __init__.py
│   ├── Scrape.py scraper functions to be used for multiprocessing
│   └── TimeSlice.py time slice script to slice the time span in 24 slices, speeds up scraping through multiprocessing
├── log logs of the scraping process
│   ├── log_2023-06-23_21-06-10_err.log
│   ├── log_2023-06-23_21-06-10.log
│   └── log_2023-06-23_21-06-10_missing.log
├── models
│   ├── CovClass Covid tweet classification model
│   │   └── 2023-08-15_05-56-50
│   │   ├── 2023-08-15_05-56-50.csv training output
│   │   ├── config.json
│   │   ├── pytorch_model.bin
│   │   ├── special_tokens_map.json
│   │   ├── tokenizer_config.json
│   │   ├── tokenizer.json
│   │   └── vocab.txt
│   └── FakeClass Fake tweet classification model
│   └── 2023-08-15_14-35-43
│   ├── 2023-08-15_14-35-43.csv training output
│   ├── config.json
│   ├── pytorch_model.bin
│   ├── special_tokens_map.json
│   ├── tokenizer_config.json
│   ├── tokenizer.json
│   └── vocab.txt
├── snscrape contains snscrape 0.6.2.20230321+ git repo
├── ClassificationFake.py classifies tweets as fake or non-fake, saves:
│ Tweets-Classified-Fake-Prep.csv - prepared training dataset
│ Tweets-Classified-Fake-Results.csv - Tweets-Classified-Topic-Results.csv with cov classification results
├── ClassificationTopic.py classifies tweet topic, saves:
│ Tweets-Classified-Topic-Prep.csv - prepared training dataset
│ Tweets-Classified-Topic-Results.csv - SenatorsTweets-OnlyCov.csv with cov classification results
├── cleanTweets.py Curates keywordlists
│ Merges senator and tweet datasets
│ Creates multiple datasets:
│ SenatorsTweets-Final.csv - all tweets with keyword columns
│ SenatorsTweets-OnlyCov.csv - only covid tweets, filtered by keywordlist
│ SenatorsTweets-Training.csv - training dataset, containing ~1800 randomly selected tweets from SenatorsTweets-OnlyCov.csv
├── collect.py scrapes tweets, saves to ALL-SENATORS-TWEETS.csv
├── collectSenData.py scrapes senator account data, saves to ALL-SENATORS.csv
├── createGraphs.py creates wordcloud & timeline graphs
├── preTestClassification.py pretest script that uses bvrau/covid-twitter-bert-v2-struth to analyze 100 preclassified tweets
├── profiler.py creates dataset profiles
├── README.md readme
├── trainFake.py training script for the fake tweet classification model
└── trainTopic.py training script for the tweet topic classification model

View File

@@ -1,129 +0,0 @@
import re
import string
import numpy as np
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
#%%
# prepare
# install xformers (pip install xformers) for better performance
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
# Name of file that all senator data will be written to
senCSV = "ALL-SENATORS-TWEETS.csv"
# Name of new datafile generated
senCSVc = "Tweets-Stub.csv"
# Name of pretest files
preTestIDsFake = "pretest-tweets_fake.txt"
preTestIDsNot = "pretest-tweets_not_fake.txt"
# Name of pretest datafile
senCSVPretest = "Pretest.csv"
senCSVPretestPrep = "Pretest-Prep.csv"
senCSVPretestResult = "Pretest-Results.csv"
# don't change this one
senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSVc
senCSVcPretestPath = wd + ud + senCSVPretest
senCSVcPretestPrepPath = wd + ud + senCSVPretestPrep
senCSVcPretestResultPath = wd + ud + senCSVPretestResult
preTestIDsFakePath = wd + di + preTestIDsFake
preTestIDsNotPath = wd + di + preTestIDsNot
# List of IDs to select
# Read the IDs from a file
preTestIDsFakeL = []
preTestIDsNotL = []
with open(preTestIDsFakePath, "r") as file:
lines = file.readlines()
for line in lines:
tid = line.strip() # Remove the newline character
preTestIDsFakeL.append(tid)
with open(preTestIDsNotPath, "r") as file:
lines = file.readlines()
for line in lines:
tid = line.strip() # Remove the newline character
preTestIDsNotL.append(tid)
# Select rows based on the IDs
df = pd.read_csv(senCSVPath, dtype=(object))
#%%
# Create pretest dataframe
dfPreTest = df[df['id'].isin(preTestIDsFakeL)].copy()
dfPreTest['fake'] = True
dfPreTest = pd.concat([dfPreTest, df[df['id'].isin(preTestIDsNotL)]], ignore_index=True)
dfPreTest['fake'] = dfPreTest['fake'].fillna(False)
#%%
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
# HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth")
model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(remove_URL)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_emoji)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_html)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_punct)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(lambda x: x.lower())
#%%
max_length = 128
dfPreTest['input_ids'] = dfPreTest['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
#train.rename(columns={'target': 'labels'}, inplace=True)
#train.head()
# %%
dfPreTest.to_csv(senCSVcPretestPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
#%%
dataset = load_dataset("csv", data_files=senCSVcPretestPrepPath)
# %%
results = pipe(KeyDataset(dataset, "text"))
# %%
#from tqdm.auto import tqdm
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
# print(out)
#%%
output_labels = []
output_score = []
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
output_labels.append(out['label'])
output_score.append(out['score'])
# [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
# Exactly the same output as before, but the content are passed
# as batches to the model
# %%
dfPreTest['output_label'] = output_labels
dfPreTest['output_score'] = output_score
# %%
dfPreTest.to_csv(senCSVcPretestResultPath, encoding='utf-8')
# %%

View File

@@ -9,9 +9,12 @@ Created on Mon Jun 26 20:36:43 2023
import pandas as pd import pandas as pd
# import pyreadstat # import pyreadstat
import numpy as np import numpy as np
from funs.ClearDupes import deDupe import sys
# Seet for training dataset generation
seed = 86431891
################### ###################
# Setup directories # Setup directories
# WD Michael # WD Michael
@@ -34,17 +37,24 @@ senDataset = "senators-raw.csv"
# Name of new datafile generated # Name of new datafile generated
senCSVc = "SenatorsTweets-Final" senCSVc = "SenatorsTweets-Final"
senCSVcCov = "SenatorsTweets-OnlyCov" senCSVcCov = "SenatorsTweets-OnlyCov"
senCSVcTrain = "SenatorsTweets-Training"
# don't change this one # don't change this one
senCSVPath = wd + ud + senCSV senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSVc + ".csv" senCSVcPath = wd + ud + senCSVc + ".csv"
senCSVcCovPath = wd + ud + senCSVcCov + ".csv" senCSVcCovPath = wd + ud + senCSVcCov + ".csv"
senCSVcTrainPath = wd + ud + senCSVcTrain + ".csv"
senSAVcPath = wd + ud + senCSV + ".sav" senSAVcPath = wd + ud + senCSV + ".sav"
senDTAcPath = wd + ud + senCSV + ".dta" senDTAcPath = wd + ud + senCSV + ".dta"
senDatasetPath = wd + di + senDataset senDatasetPath = wd + di + senDataset
df = pd.read_csv(senCSVPath, dtype=(object)) df = pd.read_csv(senCSVPath, dtype=(object))
## Import own functions
funs = wd+"funs"
sys.path.insert(1, funs)
from ClearDupes import deDupe
mixed_columns = df.columns[df.nunique() != len(df)] mixed_columns = df.columns[df.nunique() != len(df)]
print(mixed_columns) print(mixed_columns)
@@ -188,7 +198,6 @@ dfCov = dfAll[dfAll['contains_counterKeyword']==False]
dfCov = dfCov[dfCov['contains_keyword']==True] dfCov = dfCov[dfCov['contains_keyword']==True]
dfCov = dfCov.drop(columns=['contains_counterKeyword', 'counterKeywords']) dfCov = dfCov.drop(columns=['contains_counterKeyword', 'counterKeywords'])
#%% #%%
# create column with tweet length # create column with tweet length
@@ -211,3 +220,14 @@ dfCov.to_csv(senCSVcCovPath, encoding='utf-8', index_label = 'id')
# ========================= # =========================
# %% # %%
# Create training dataset
np.random.seed(seed);
dfTrain = pd.dfCov(np.random.rand(1800))
# %%
# Create training dataset
np.random.seed(seed);
dfTrain = dfCov.loc[np.random.choice(dfCov.index, 1800, replace=False)]
dfTrain = dfTrain[['tid', 'date', 'rawContent']]
dfTrain['topicCovid'] = True
dfTrain['fake'] = False
dfTrain.to_csv(senCSVcTrainPath, encoding='utf-8')

View File

@@ -66,7 +66,6 @@ which is the final output.
import os import os
import pandas as pd import pandas as pd
import glob import glob
import time
import sys import sys
from datetime import datetime from datetime import datetime
import concurrent.futures import concurrent.futures
@@ -149,10 +148,12 @@ tweetDFColumns = [
################## do NOT change anything below this line ################### ################## do NOT change anything below this line ###################
############################################################################# #############################################################################
## Import functions ## Import own functions
from funs.TimeSlice import * funs = wd+"funs"
from funs.ClearDupes import deDupe sys.path.insert(1, funs)
from funs.Scrape import scrapeTweets from TimeSlice import get_Tslices
from ClearDupes import deDupe
from Scrape import scrapeTweets
################### ###################
# Create logfile & log all outputs # Create logfile & log all outputs

View File

@@ -0,0 +1,23 @@
meth
gun violence
flu season
vaping
chd
addiction
indigenous women
separating children
tobacco
e-cigarette
muslim ban
soleimani
cocaine
separating families
muslim travel ban
usmca trade deal
shooting
overdose
separated children
coronary heart disease
gun-violence
opioid
flu-season

View File

@@ -18,44 +18,43 @@ socialdistancing
wear a mask wear a mask
lockdown lockdown
covd covd
Coronavirus coronavirus
Koronavirus koronavirus
Corona corona
CDC cdc
Wuhancoronavirus wuhancoronavirus
Wuhanlockdown wuhanlockdown
Ncov ncov
Wuhan wuhan
N95 n95
Kungflu kungflu
Epidemic epidemic
outbreak outbreak
Sinophobia sinophobia
China
covid-19 covid-19
corona virus corona virus
covid covid
covid19 covid19
sars-cov-2 sars-cov-2
COVIDー19 covidー19
COVD covd
pandemic pandemic
coronapocalypse coronapocalypse
canceleverything canceleverything
Coronials coronials
SocialDistancingNow socialdistancingnow
Social Distancing social distancing
SocialDistancing socialdistancing
panicbuy panicbuy
panic buy panic buy
panicbuying panicbuying
panic buying panic buying
14DayQuarantine 14dayquarantine
DuringMy14DayQuarantine duringmy14dayquarantine
panic shop panic shop
panic shopping panic shopping
panicshop panicshop
InMyQuarantineSurvivalKit inmyquarantinesurvivalkit
panic-buy panic-buy
panic-shop panic-shop
coronakindness coronakindness
@@ -65,7 +64,7 @@ chinesevirus
stayhomechallenge stayhomechallenge
stay home challenge stay home challenge
sflockdown sflockdown
DontBeASpreader dontbeaspreader
lockdown lockdown
lock down lock down
shelteringinplace shelteringinplace
@@ -79,13 +78,13 @@ flatten the curve
china virus china virus
chinavirus chinavirus
quarentinelife quarentinelife
PPEshortage ppeshortage
saferathome saferathome
stayathome stayathome
stay at home stay at home
stay home stay home
stayhome stayhome
GetMePPE getmeppe
covidiot covidiot
epitwitter epitwitter
pandemie pandemie
@@ -93,7 +92,7 @@ wear a mask
wearamask wearamask
kung flu kung flu
covididiot covididiot
COVID__19 covid__19
omicron omicron
variant variant
vaccine vaccine
@@ -139,9 +138,7 @@ work from home
workfromhome workfromhome
working from home working from home
workingfromhome workingfromhome
ppe
n95 n95
ppe
n95 n95
covidiots covidiots
covidiots covidiots

8
data/OUT/.gitignore vendored Normal file
View File

@@ -0,0 +1,8 @@
/ALL-SENATORS-TWEETS.csv
/Pretest-Prep.csv
/Pretest-Results.csv
/Pretest-SENATORS-TWEETS.csv
/SenatorsTweets-Final.csv
/SenatorsTweets-OnlyCov.csv
/Tweets-Classified-Prep.csv
/Tweets-Stub.csv

3
data/OUT/graphs/.gitignore vendored Normal file
View File

@@ -0,0 +1,3 @@
/Timeline.png
/Wordcloud-All.png
/Wordcloud-Cov.png

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -1,10 +1,20 @@
import re import re
import string import string
def remove_URL(text): def preprocess_roberta(text): # https://huggingface.co/cardiffnlp/twitter-roberta-base-sep2022
url = re.compile(r'https?://\S+|www\.\S+') preprocessed_text = []
return url.sub(r'', text) for t in text.split():
if len(t) > 1:
t = '@user' if t[0] == '@' and t.count('@') == 1 else t
t = 'http' if t.startswith('http') else t
preprocessed_text.append(t)
return ' '.join(preprocessed_text)
def remove_URL(text):
try:
url = re.compile(r'https?://\S+|www\.\S+')
except: print(text)
return url.sub(r'', text)
def remove_emoji(text): def remove_emoji(text):
emoji_pattern = re.compile( emoji_pattern = re.compile(
@@ -19,21 +29,61 @@ def remove_emoji(text):
flags=re.UNICODE) flags=re.UNICODE)
return emoji_pattern.sub(r'', text) return emoji_pattern.sub(r'', text)
def remove_html(text): def remove_html(text):
html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});') html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
return re.sub(html, '', text) return re.sub(html, '', text)
def remove_punct(text): def remove_punct(text):
table = str.maketrans('', '', string.punctuation) table = str.maketrans('', '', string.punctuation)
return text.translate(table) return text.translate(table)
def clean_all(text): def remove_nonascii(text):
if not isinstance(text, str): return re.sub(r'[^\x00-\x7F]+', '', text)
text = str(text) # Convert non-string values to string
def remove_spec(text):
text = re.sub(r'&amp;?', r'and', text)
text = re.sub(r'&lt;', r'<', text)
return re.sub(r'&gt;', r'>', text)
def remove_spaces(text): # also new line chars and to lower case
text = re.sub(r'&lt;', r'<', text)
text = " ".join(text.splitlines()) # remove newline characters
text = text.lower()
text = text.strip()
return re.sub(r'\s{2,}', ' ', text)
def remove_retw(text):
text = re.sub(r'(RT|rt)[ ]*@[ ]*[\S]+', '', text)
return re.sub(r'@[\S]+', '', text)
def preprocess_text(text):
text = remove_URL(text) text = remove_URL(text)
text = remove_emoji(text) text = remove_emoji(text)
text = remove_html(text) text = remove_html(text)
text = remove_punct(text) text = remove_punct(text)
text = remove_nonascii(text)
text = remove_spec(text)
text = remove_spaces(text)
text = remove_retw(text)
return text return text
def preprocess_text_series(series):
series = series.apply(remove_URL)
series = series.apply(remove_emoji)
series = series.apply(remove_html)
series = series.apply(remove_punct)
series = series.apply(remove_nonascii)
series = series.apply(remove_spec)
series = series.apply(remove_spaces)
series = series.apply(remove_retw)
return series
# Check all functions:
input_text = """
Check out this amazing website: https://www.example.com! 😃
<html>This is an HTML tag.</html>
RT @user123: Just received a package from @companyXYZ. It's awesome! 📦
This is a test text with lots of punctuations!!! Can't wait to see more...
"""
processed_text = preprocess_text(input_text)
# print(processed_text)

View File

@@ -0,0 +1,7 @@
epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
1,0.39025546515679493,0.40877932761593355,0.9103260869565217,0:10:21,0:00:40
2,0.3057803610952067,0.3502063500978377,0.9103260869565217,0:10:53,0:00:43
3,0.17910970049364833,0.27903796154904464,0.9375,0:10:30,0:00:38
4,0.09279396105943587,0.41342766528301267,0.904891304347826,0:11:03,0:00:43
5,0.06132459050129317,0.4468563502887264,0.9239130434782609,0:12:07,0:00:44
6,0.04195396880810895,0.4350045176675928,0.9266304347826086,0:11:21,0:00:40
1 epoch Training Loss Valid. Loss Valid. Accur. Training Time Validation Time
2 1 0.39025546515679493 0.40877932761593355 0.9103260869565217 0:10:21 0:00:40
3 2 0.3057803610952067 0.3502063500978377 0.9103260869565217 0:10:53 0:00:43
4 3 0.17910970049364833 0.27903796154904464 0.9375 0:10:30 0:00:38
5 4 0.09279396105943587 0.41342766528301267 0.904891304347826 0:11:03 0:00:43
6 5 0.06132459050129317 0.4468563502887264 0.9239130434782609 0:12:07 0:00:44
7 6 0.04195396880810895 0.4350045176675928 0.9266304347826086 0:11:21 0:00:40

View File

@@ -0,0 +1,7 @@
epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
1,0.6699380816093513,0.6216431430407933,0.6964285714285714,0:01:03,0:00:02
2,0.6649796058024678,0.621175297669002,0.6964285714285714,0:01:03,0:00:01
3,0.642247314964022,0.6377243144171578,0.6964285714285714,0:01:05,0:00:02
4,0.6300328698541436,0.6038827853543418,0.6964285714285714,0:01:04,0:00:02
5,0.544977219509227,0.6619421115943364,0.625,0:01:02,0:00:02
6,0.3951783587357828,0.48477122613361906,0.7857142857142857,0:01:05,0:00:01
1 epoch Training Loss Valid. Loss Valid. Accur. Training Time Validation Time
2 1 0.6699380816093513 0.6216431430407933 0.6964285714285714 0:01:03 0:00:02
3 2 0.6649796058024678 0.621175297669002 0.6964285714285714 0:01:03 0:00:01
4 3 0.642247314964022 0.6377243144171578 0.6964285714285714 0:01:05 0:00:02
5 4 0.6300328698541436 0.6038827853543418 0.6964285714285714 0:01:04 0:00:02
6 5 0.544977219509227 0.6619421115943364 0.625 0:01:02 0:00:02
7 6 0.3951783587357828 0.48477122613361906 0.7857142857142857 0:01:05 0:00:01

View File

@@ -0,0 +1,7 @@
epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
1,0.5610552686641376,0.4569096086310089,0.9116022099447514,0:37:20,0:00:31
2,0.43647773836513126,0.5441495520680196,0.9005524861878453,0:36:14,0:00:30
3,0.288773139899344,0.43471020716692715,0.9392265193370166,0:36:10,0:00:29
4,0.19330878817686287,0.4555162174395349,0.9281767955801105,0:36:17,0:00:30
5,0.09109889855869348,0.5060150003684702,0.9281767955801105,0:36:13,0:00:30
6,0.05734757932275739,0.6043995772428771,0.9226519337016574,0:36:11,0:00:31
1 epoch Training Loss Valid. Loss Valid. Accur. Training Time Validation Time
2 1 0.5610552686641376 0.4569096086310089 0.9116022099447514 0:37:20 0:00:31
3 2 0.43647773836513126 0.5441495520680196 0.9005524861878453 0:36:14 0:00:30
4 3 0.288773139899344 0.43471020716692715 0.9392265193370166 0:36:10 0:00:29
5 4 0.19330878817686287 0.4555162174395349 0.9281767955801105 0:36:17 0:00:30
6 5 0.09109889855869348 0.5060150003684702 0.9281767955801105 0:36:13 0:00:30
7 6 0.05734757932275739 0.6043995772428771 0.9226519337016574 0:36:11 0:00:31

View File

@@ -0,0 +1,7 @@
epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
1,0.21681843259712502,0.0005426188472483773,1.0,0:01:13,0:00:02
2,0.00016121647037353423,0.0002873415878639207,1.0,0:01:12,0:00:02
3,6.752021149355535e-05,0.00024319994372490328,1.0,0:01:12,0:00:02
4,4.7950222591787355e-05,0.00022139604243420763,1.0,0:01:13,0:00:02
5,3.99839740138679e-05,0.00021302999493855168,1.0,0:01:11,0:00:02
6,3.5356899656214995e-05,0.00020912183117616223,1.0,0:01:13,0:00:02
1 epoch Training Loss Valid. Loss Valid. Accur. Training Time Validation Time
2 1 0.21681843259712502 0.0005426188472483773 1.0 0:01:13 0:00:02
3 2 0.00016121647037353423 0.0002873415878639207 1.0 0:01:12 0:00:02
4 3 6.752021149355535e-05 0.00024319994372490328 1.0 0:01:12 0:00:02
5 4 4.7950222591787355e-05 0.00022139604243420763 1.0 0:01:13 0:00:02
6 5 3.99839740138679e-05 0.00021302999493855168 1.0 0:01:11 0:00:02
7 6 3.5356899656214995e-05 0.00020912183117616223 1.0 0:01:13 0:00:02

View File

@@ -1,13 +1,8 @@
import re
import string
import numpy as np
import pandas as pd import pandas as pd
from datetime import datetime from datetime import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset from transformers.pipelines.pt_utils import KeyDataset
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
#%% #%%
# prepare # prepare
@@ -40,7 +35,6 @@ senCSVPretest = "Pretest.csv"
senCSVPretestPrep = "Pretest-Prep.csv" senCSVPretestPrep = "Pretest-Prep.csv"
senCSVPretestResult = "Pretest-Results.csv" senCSVPretestResult = "Pretest-Results.csv"
# don't change this one # don't change this one
senCSVPath = wd + ud + senCSV senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSVc senCSVcPath = wd + ud + senCSVc
@@ -50,6 +44,11 @@ senCSVcPretestResultPath = wd + ud + senCSVPretestResult
preTestIDsFakePath = wd + di + preTestIDsFake preTestIDsFakePath = wd + di + preTestIDsFake
preTestIDsNotPath = wd + di + preTestIDsNot preTestIDsNotPath = wd + di + preTestIDsNot
import sys
funs = wd+"funs"
sys.path.insert(1, funs)
import CleanTweets
# List of IDs to select # List of IDs to select
# Read the IDs from a file # Read the IDs from a file
preTestIDsFakeL = [] preTestIDsFakeL = []
@@ -85,11 +84,7 @@ tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert # Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(remove_URL) dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(CleanTweets.preprocess_text)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_emoji)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_html)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_punct)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(lambda x: x.lower())
#%% #%%
timeStart = datetime.now() # start counting execution time timeStart = datetime.now() # start counting execution time

55
profiler.py Normal file
View File

@@ -0,0 +1,55 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 8 14:49:02 2023
@author: michael
"""
import pandas as pd
import pandas_profiling as pp
import numpy
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
# Name of file that all senator data will be written to
senCSV = "ALL-SENATORS-TWEETS.csv"
# Name of file that all senator data will be written to
senDataset = "senators-raw.csv"
# Name of new datafile generated
senCSVc = "SenatorsTweets-Final"
senCSVcCov = "SenatorsTweets-OnlyCov"
# don't change this one
senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSVc + ".csv"
senCSVcCovPath = wd + ud + senCSVcCov + ".csv"
senSAVcPath = wd + ud + senCSV + ".sav"
senDTAcPath = wd + ud + senCSV + ".dta"
senDatasetPath = wd + di + senDataset
# forming dataframe and printing
df = pd.read_csv(senCSVPath, dtype=(object))
# forming ProfileReport and save
# as output.html file
profileAll = pp.ProfileReport(df, minimal=True)
profileAll.to_file("data/OUT/profiles/AllTweets.html")
df = pd.read_csv(senCSVcCovPath, dtype=(object))
profileAll = pp.ProfileReport(df, minimal=True)
profileAll.to_file("data/OUT/profiles/CovTweets.html")

35
repairmystupidity.py Normal file
View File

@@ -0,0 +1,35 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 14 20:47:22 2023
@author: michael
"""
import pandas as pd
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
falsch = wd + ud + "SenatorsTweets-Training_WORKING-COPY-correct.csv"
richtig = wd + ud + "SenatorsTweets-Training.csv"
correct = wd + ud + "SenatorsTweets-Training_WORKING-COPY-correct2.csv"
# Name of new datafile generated
senCSVprep = "SenatorsTweets-Training_WORKING-COPY-prepared"
# don't change this one
falsch = pd.read_csv(falsch, dtype=(object), sep=";")
richtig = pd.read_csv(richtig, dtype=(object))
df = pd.merge(falsch,richtig[['tid','rawContent', 'date']],on='tid', how='left')
df.drop(columns=['rawContent_x', 'date_x'], inplace=True)
df.rename(columns={'tid_y':'tid', 'rawContent_y':'rawContent', 'date_y':'date'}, inplace=True)
df = df[['tid','date','topicCovid','fake','rawContent','Unnamed: 6']]
df.rename(columns={'Unnamed: 6':'comment'}, inplace=True)
df.to_csv(correct, encoding='utf-8', sep=";")

613
trainFake.py Normal file
View File

@@ -0,0 +1,613 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Aug 12 12:25:18 2023
@author: michael
"""
#from datasets import load_dataset
#from transformers import Trainer
#from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import torch
import numpy as np
from sklearn.model_selection import train_test_split # pip install scikit-learn
import pandas as pd
## Uses snippets from this guide:
# https://mccormickml.com/2019/07/22/BERT-fine-tuning/
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
import sys
funs = wd+"funs"
sys.path.insert(1, funs)
import CleanTweets
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
# Training CSV dataset
twtCSV = "SenatorsTweets-Training_WORKING-COPY-correct2"
twtCSVtrainCovClass = "SenatorsTweets-train-CovClassification"
twtCSVtrainFakeClass = "SenatorsTweets-train-FakeClassification"
statsTrainingTopicClass = "statsTopicClassification-"
# don't change this one
twtCSVPath = wd + ud + twtCSV + ".csv"
twtCSVtrainCovClassPath = wd + ud + twtCSVtrainCovClass + ".csv"
twtCSVtrainFakeClassPath = wd + ud + twtCSVtrainFakeClass + ".csv"
statsTrainingTopicClassPath = wd + ud + statsTrainingTopicClass
twtCSVtrainCovClassPathTrain = wd + ud + twtCSVtrainCovClass + "TRAIN.csv"
twtCSVtrainFakeClassPathTrain = wd + ud + twtCSVtrainFakeClass + "TRAIN.csv"
twtTSVtrainCovClassPathTrain = wd + ud + "cov-train.tsv"
twtTSVtrainFakeClassPathTrain = wd + ud + "fake-train.tsv"
twtTSVtrainCovClassPathEval = wd + ud + "cov-eval.tsv"
twtTSVtrainFakeClassPathEval = wd + ud + "fake-eval.tsv"
seed = 12355
# Model paths
modCovClassPath = wd + "models/CovClass/"
modFakeClassPath = wd + "models/FakeClass/"
model_name = 'digitalepidemiologylab/covid-twitter-bert-v2' # accuracy 69
#model_name = 'justinqbui/bertweet-covid19-base-uncased-pretraining-covid-vaccine-tweets' #48
#model_name = "cardiffnlp/tweet-topic-latest-multi"
model_name = "bvrau/covid-twitter-bert-v2-struth"
#model_name = "cardiffnlp/roberta-base-tweet-topic-single-all"
model_fake_name = 'bvrau/covid-twitter-bert-v2-struth'
# More models for fake detection:
# https://huggingface.co/justinqbui/bertweet-covid-vaccine-tweets-finetuned
tokenizer = AutoTokenizer.from_pretrained(model_name)
max_length = 64 # max token sentence length
#%%
# Create training and testing dataset
dfTest = pd.read_csv(twtCSVPath, dtype=(object), delimiter=";")
#dfTest = dfTest[:-900] # remove last 800 rows
#dfTest = dfTest.iloc[:,:-3] # remove last 800 rows
dfTest['text'] = dfTest['rawContent'].apply(CleanTweets.preprocess_roberta)
dfTest.drop(columns=['rawContent'], inplace=True)
# Only keep tweets that are longer than 3 words
dfTest['tweet_proc_length'] = [len(text.split(' ')) for text in dfTest['text']]
dfTest['tweet_proc_length'].value_counts()
dfTest = dfTest[dfTest['tweet_proc_length']>3]
dfTest = dfTest.drop_duplicates(subset=['text'])
dfTest = dfTest.drop(columns=['date', 'Unnamed: 0'])
# Create datasets for each classification
dfCovClass = dfTest
dfFakeClass = dfTest
dfCovClass = dfCovClass.drop(columns=['fake']) # fake column not needed in covid topic classification data
dfFakeClass = dfFakeClass[dfFakeClass['topicCovid']=='True'].drop(columns=['topicCovid']) # topicCovid column not neeeded in covid topic classification data
#type_map = {'Covid tweet': 'covid tweets', 'Noncovid tweet': 'noncovid tweet'}
dfCovClass.rename(index = str, columns={'topicCovid': 'labels', 'tid': 'id'}, inplace = True)
dfCovClass.labels = dfCovClass.labels.replace({"True": 'Covid', "False": 'NonCovid'})
#type_map = {'fake news tweet': 'fake news tweet', 'non-fake-news-tweet': 'non-fake-news-tweet'}
dfFakeClass.rename(index = str, columns={'fake': 'labels', 'tid': 'id'}, inplace = True)
#%%
# Tokenize tweets
dfCovClass = dfCovClass[dfCovClass['labels'].notna()]
dfFakeClass['labels'].replace({'Check': '','check': '', 'FALSE':''}, inplace=True)
dfFakeClass = dfFakeClass[dfFakeClass['labels'].notna()]
dfCovClass['input_ids'] = dfCovClass['text'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
dfFakeClass['input_ids'] = dfFakeClass['text'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
def encode_labels(label):
    """Translate a string class label into its binary integer code.

    'Covid' and 'False' map to 1; 'NonCovid' and 'True' map to 0.
    Any other value (including NaN-like leftovers) falls back to 0.
    """
    label_codes = {
        'Covid': 1,
        'NonCovid': 0,
        'False': 1,
        'True': 0,
    }
    return label_codes.get(label, 0)
dfCovClass['labels_encoded'] = dfCovClass['labels'].apply(encode_labels)
dfFakeClass['labels_encoded'] = dfFakeClass['labels'].apply(encode_labels)
dfFakeClass = dfFakeClass[dfFakeClass['labels']!=""]
#dfFakeClass = dfFakeClass[(dfFakeClass['labels']=="Fake") | (dfFakeClass['labels']=="True")]
# get n of classes
print("# of Non-Covid tweets (coded 0):")
print(dfCovClass.groupby('labels_encoded', group_keys=False)['id'].nunique())
# 62 non-covid tweets, disproportionate sample for training has to be 124 tweets
print("# of Fake-news tweets (coded 1):")
print(dfFakeClass.groupby('labels_encoded', group_keys=False)['id'].nunique())
# create disproportionate sample - 50/50 of both
#dfCovClass.groupby('labels_encoded', group_keys=False)['id'].nunique()
#dfCovClass = dfCovClass.groupby('labels_encoded', group_keys=False).apply(lambda x: x.sample(164, random_state=seed))
# after a lot of tests, it seems that a sample in which non-fake news tweets are overrepresented leads to better results.
# because of this, performance limitations and time constraints, group 1 (covid topic) will be overrepresented (twice as many), which still doesn't reflect the real proportions ~10/1
'''dfCovClassa = dfCovClass.groupby('labels_encoded', group_keys=False).get_group(1).sample(frac=1, replace=True).reset_index()
dfCovClassb = dfCovClass.groupby('labels_encoded', group_keys=False).get_group(0).sample(frac=1, replace=True).reset_index()
dfCovClassab= pd.concat([dfCovClassa,dfCovClassb])
dfCovClassab.reset_index(inplace=True)
dfCovClass_train, dfCovClass_test = train_test_split(dfCovClassab, test_size=0.1, random_state=seed, stratify=dfCovClassab['labels_encoded'])
'''
# create training and validation samples
dfFakeClass_train, dfFakeClass_test = train_test_split(dfFakeClass, test_size=0.1, random_state=seed, stratify=dfFakeClass['labels_encoded'])
# reset index and drop unnecessary columns
dfFakeClass_train.reset_index(drop=True, inplace=True)
dfFakeClass_train.drop(inplace=True, columns=['tweet_proc_length'])
dfFakeClass_train.groupby('labels_encoded', group_keys=False)['id'].nunique()
dfFakeClass_test.reset_index(drop=True, inplace=True)
dfFakeClass_test.drop(inplace=True, columns=['tweet_proc_length'])
dfFakeClass_test.groupby('labels_encoded', group_keys=False)['id'].nunique()
# save dfs as csvs and tsvs, for training and validation
# covid classification datafiles
# rows 0-41 = noncovid, 42-81 covid, therefore:
#dfCovClass = dfCovClass.drop(columns=['tweet_proc_length'])
#dfCovClass.reset_index(inplace=True, drop=True)
#dfCovClass.loc[np.r_[0:31, 42:71], :].reset_index(drop=True).to_csv(twtCSVtrainCovClassPathTrain, encoding='utf-8', sep=";")
#dfCovClass.loc[np.r_[0:31, 42:72], :].reset_index(drop=True).to_csv(twtTSVtrainCovClassPathTrain, encoding='utf-8', sep="\t")
#dfCovClass.loc[np.r_[31:41, 72:81], :].reset_index(drop=True).to_csv(twtCSVtrainCovClassPath, encoding='utf-8', sep=";")
#dfCovClass.loc[np.r_[31:41, 72:81], :].reset_index(drop=True).to_csv(twtTSVtrainCovClassPathEval, encoding='utf-8', sep="\t")
# fake news classification datafiles
#dfFakeClass = dfFakeClass.drop(columns=['tweet_proc_length'])
#dfFakeClass[200:1000].reset_index(drop=True).to_csv(twtCSVtrainFakeClassPathTrain, encoding='utf-8', sep=";")
#dfFakeClass[200:1000].reset_index(drop=True).to_csv(twtTSVtrainFakeClassPathTrain, encoding='utf-8', sep="\t")
#dfFakeClass[0:199].reset_index(drop=True).to_csv(twtCSVtrainFakeClassPath, encoding='utf-8', sep=";")
#dfFakeClass[0:199].reset_index(drop=True).to_csv(twtTSVtrainFakeClassPathEval, encoding='utf-8', sep="\t")
#%%
# Prepare trainer
#from transformers import TrainingArguments
#training_args = TrainingArguments(
# report_to = 'wandb',
# output_dir=wd+'results', # output directory/
# overwrite_output_dir = True,
# num_train_epochs=6, # total number of training epochs
# per_device_train_batch_size=8, # batch size per device during training
# per_device_eval_batch_size=16, # batch size for evaluation
# learning_rate=2e-5,
# warmup_steps=1000, # number of warmup steps for learning rate scheduler
# weight_decay=0.01, # strength of weight decay
# logging_dir='./logs3', # directory for storing logs
# logging_steps=1000,
# evaluation_strategy="epoch",
# save_strategy="epoch",
# load_best_model_at_end=True
#)
tokenizer = AutoTokenizer.from_pretrained(model_name)
from transformers import BertForSequenceClassification, AdamW#, BertConfig
#from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
"""
train_dataset = load_dataset('csv', data_files={'train': twtCSVtrainCovClassPathTrain}, encoding = "utf-8")
train_dataset = train_dataset['train']
eval_dataset = load_dataset('csv', data_files={'test': twtCSVtrainCovClassPath}, encoding = "utf-8")
eval_dataset = eval_dataset['test']
"""
batch_size = 1
from torch.utils.data import Dataset
class PandasDataset(Dataset):
    """Torch ``Dataset`` over a pandas DataFrame of labelled tweets.

    The frame must provide a ``text`` column (raw string) and a
    ``labels_encoded`` column (integer class code). Tokenization happens
    lazily per item, padded/truncated to a fixed length.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        # Keep references only; no work is done until __getitem__.
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        # One dataset item per DataFrame row.
        return len(self.dataframe)

    def __getitem__(self, index):
        record = self.dataframe.iloc[index]
        tokens = self.tokenizer(
            record['text'],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )
        return {
            'input_ids': torch.tensor(tokens['input_ids']),
            'attention_mask': torch.tensor(tokens['attention_mask']),
            # Labels are assumed to be integer-encoded already.
            'labels': torch.tensor(record['labels_encoded']),
        }
train_dataset = PandasDataset(dfFakeClass_train, tokenizer, max_length)
train_dataloader = DataLoader(
train_dataset,
sampler=RandomSampler(train_dataset),
batch_size=batch_size
)
eval_dataset = PandasDataset(dfFakeClass_test, tokenizer, max_length)
validation_dataloader = DataLoader(
eval_dataset,
sampler=SequentialSampler(eval_dataset),
batch_size=batch_size
)
for idx, batch in enumerate(train_dataloader):
print('Batch index: ', idx)
print('Batch size: ', batch['input_ids'].size()) # Access 'input_ids' field
print('Batch label: ', batch['labels']) # Access 'labels' field
break
model = BertForSequenceClassification.from_pretrained(
model_name,
num_labels = 2, # The number of output labels--2 for binary classification.
# You can increase this for multi-class tasks.
output_attentions = False, # Whether the model returns attentions weights.
output_hidden_states = False, # Whether the model returns all hidden-states.
)
#trainer = Trainer(
# model=model, # the instantiated 🤗 Transformers model to be trained
# args=training_args, # training arguments, defined above
# train_dataset=train_dataset, # training dataset
# eval_dataset=eval_dataset # evaluation dataset
#)
# Note: AdamW is a class from the huggingface library (as opposed to pytorch)
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
eps = 1e-8 # args.adam_epsilon - default is 1e-8.
)
from transformers import get_linear_schedule_with_warmup
# Number of training epochs. The BERT authors recommend between 2 and 4.
# We chose to run for 6
epochs = 6
# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
num_warmup_steps = 0, # Default value in run_glue.py
num_training_steps = total_steps)
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows where argmax(preds) equals labels.

    ``preds`` is a (n, num_classes) logit/probability array; ``labels``
    is an array of integer class ids.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()
    return np.mean(predicted == actual)
import time
import datetime
def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string."""
    # Round to whole seconds first; timedelta's str() gives h:mm:ss.
    whole_seconds = int(round(elapsed))
    return str(datetime.timedelta(seconds=whole_seconds))
import random
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
# Set the seed value all over the place to make this reproducible.
seed_val = 12355
# If there's a GPU available...
if torch.cuda.is_available():
# Tell PyTorch to use the GPU.
device = torch.device("cuda")
print('There are %d GPU(s) available.' % torch.cuda.device_count())
print('We will use the GPU:', torch.cuda.get_device_name(0))
#model.cuda()
# If not...
else:
print('No GPU available, using the CPU instead.')
device = torch.device("cpu")
device = torch.device("cpu")
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
#%%
# Start training
# We'll store a number of quantities such as training and validation loss,
# validation accuracy, and timings.
training_stats = []
# Measure the total training time for the whole run.
total_t0 = time.time()
# For each epoch...
for epoch_i in range(0, epochs):
# ========================================
# Training
# ========================================
# Perform one full pass over the training set.
print("")
print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
print('{:>5,} steps per batch will be calculated.'.format(len(train_dataloader)))
print('Training...')
# Measure how long the training epoch takes.
t0 = time.time()
model.to(device)
# Reset the total loss for this epoch.
total_train_loss = 0
# Put the model into training mode. Don't be mislead--the call to
# `train` just changes the *mode*, it doesn't *perform* the training.
# `dropout` and `batchnorm` layers behave differently during training
# vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
model.train()
# For each batch of training data...
for step, batch in enumerate(train_dataloader):
# Progress update every 10 batches.
if step % 10 == 0 and not step == 0:
# Calculate elapsed time in minutes.
elapsed = format_time(time.time() - t0)
# Report progress.
print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
# Unpack this training batch from our dataloader.
#
# As we unpack the batch, we'll also copy each tensor to the GPU using the
# `to` method.
#
# `batch` contains three pytorch tensors:
# [0]: input ids
# [1]: attention masks
# [2]: labels
print("Batch keys:", batch.keys())
b_input_ids = batch['input_ids'].to(device)
b_input_mask = batch['attention_mask'].to(device)
b_labels = batch['labels'].to(device)
# Always clear any previously calculated gradients before performing a
# backward pass. PyTorch doesn't do this automatically because
# accumulating the gradients is "convenient while training RNNs".
# (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
model.zero_grad()
# Perform a forward pass (evaluate the model on this training batch).
# The documentation for this `model` function is here:
# https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
# It returns different numbers of parameters depending on what arguments
# arge given and what flags are set. For our useage here, it returns
# the loss (because we provided labels) and the "logits"--the model
# outputs prior to activation.
output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
loss = output[0]
logits = output[1]
# Accumulate the training loss over all of the batches so that we can
# calculate the average loss at the end. `loss` is a Tensor containing a
# single value; the `.item()` function just returns the Python value
# from the tensor.
total_train_loss += loss.item()
# Perform a backward pass to calculate the gradients.
loss.backward()
# Clip the norm of the gradients to 1.0.
# This is to help prevent the "exploding gradients" problem.
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
# Update parameters and take a step using the computed gradient.
# The optimizer dictates the "update rule"--how the parameters are
# modified based on their gradients, the learning rate, etc.
optimizer.step()
# Update the learning rate.
scheduler.step()
# Calculate the average loss over all of the batches.
avg_train_loss = total_train_loss / len(train_dataloader)
# Measure how long this epoch took.
training_time = format_time(time.time() - t0)
print("")
print(" Average training loss: {0:.2f}".format(avg_train_loss))
print(" Training epcoh took: {:}".format(training_time))
# ========================================
# Validation
# ========================================
# After the completion of each training epoch, measure our performance on
# our validation set.
print("")
print("Running Validation...")
t0 = time.time()
# Put the model in evaluation mode--the dropout layers behave differently
# during evaluation.
model.eval()
# Tracking variables
total_eval_accuracy = 0
total_eval_loss = 0
nb_eval_steps = 0
# Evaluate data for one epoch
for batch in validation_dataloader:
# Unpack this training batch from our dataloader.
#
# As we unpack the batch, we'll also copy each tensor to the GPU using
# the `to` method.
#
# `batch` contains three pytorch tensors:
# [0]: input ids
# [1]: attention masks
# [2]: labels
b_input_ids = batch['input_ids'].to(device)
b_input_mask = batch['attention_mask'].to(device)
b_labels = batch['labels'].to(device)
# Tell pytorch not to bother with constructing the compute graph during
# the forward pass, since this is only needed for backprop (training).
with torch.no_grad():
# Forward pass, calculate logit predictions.
# token_type_ids is the same as the "segment ids", which
# differentiates sentence 1 and 2 in 2-sentence tasks.
# The documentation for this `model` function is here:
# https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
# Get the "logits" output by the model. The "logits" are the output
# values prior to applying an activation function like the softmax.
output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
loss = output[0]
logits = output[1]
# Accumulate the validation loss.
total_eval_loss += loss.item()
# Move logits and labels to CPU
logits = logits.detach().cpu().numpy()
label_ids = b_labels.to('cpu').numpy()
# Calculate the accuracy for this batch of test sentences, and
# accumulate it over all batches.
total_eval_accuracy += flat_accuracy(logits, label_ids)
# Report the final accuracy for this validation run.
avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
print(" Accuracy: {0:.2f}".format(avg_val_accuracy))
# Calculate the average loss over all of the batches.
avg_val_loss = total_eval_loss / len(validation_dataloader)
# Measure how long the validation run took.
validation_time = format_time(time.time() - t0)
print(" Validation Loss: {0:.2f}".format(avg_val_loss))
print(" Validation took: {:}".format(validation_time))
# Record all statistics from this epoch.
training_stats.append(
{
'epoch': epoch_i + 1,
'Training Loss': avg_train_loss,
'Valid. Loss': avg_val_loss,
'Valid. Accur.': avg_val_accuracy,
'Training Time': training_time,
'Validation Time': validation_time
}
)
print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
params = list(model.named_parameters())
print('The BERT model has {:} different named parameters.\n'.format(len(params)))
print('==== Embedding Layer ====\n')
for p in params[0:5]:
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== First Transformer ====\n')
for p in params[5:21]:
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== Output Layer ====\n')
for p in params[-4:]:
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
import os
# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
from datetime import datetime as dt
fTimeFormat = "%Y-%m-%d_%H-%M-%S"
now = dt.now().strftime(fTimeFormat)
output_dir = modFakeClassPath + now + "/"
# Create output directory if needed
if not os.path.exists(output_dir):
os.makedirs(output_dir)
print("Saving model to %s" % output_dir)
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
# Good practice: save your training arguments together with the trained model
# torch.save(args, os.path.join(output_dir, 'training_args.bin'))
import pandas as pd
# Display floats with two decimal places.
pd.set_option('display.precision', 2)
# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)
# Use the 'epoch' as the row index.# Good practice: save your training arguments together with the trained model
df_stats = df_stats.set_index('epoch')
# A hack to force the column headers to wrap.
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])
# Display the table.
df_stats
df_stats.to_csv(output_dir + now + ".csv")

607
trainTopic.py Normal file
View File

@@ -0,0 +1,607 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Aug 12 12:25:18 2023
@author: michael
"""
#from datasets import load_dataset
#from transformers import Trainer
#from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import torch
import numpy as np
from sklearn.model_selection import train_test_split # pip install scikit-learn
import pandas as pd
## Uses snippets from this guide:
# https://mccormickml.com/2019/07/22/BERT-fine-tuning/
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
import sys
funs = wd+"funs"
sys.path.insert(1, funs)
import CleanTweets
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
# Training CSV dataset
twtCSV = "SenatorsTweets-Training_WORKING-COPY-correct2"
twtCSVtrainCovClass = "SenatorsTweets-train-CovClassification"
twtCSVtrainFakeClass = "SenatorsTweets-train-FakeClassification"
statsTrainingTopicClass = "statsTopicClassification-"
# don't change this one
twtCSVPath = wd + ud + twtCSV + ".csv"
twtCSVtrainCovClassPath = wd + ud + twtCSVtrainCovClass + ".csv"
twtCSVtrainFakeClassPath = wd + ud + twtCSVtrainFakeClass + ".csv"
statsTrainingTopicClassPath = wd + ud + statsTrainingTopicClass
twtCSVtrainCovClassPathTrain = wd + ud + twtCSVtrainCovClass + "TRAIN.csv"
twtCSVtrainFakeClassPathTrain = wd + ud + twtCSVtrainFakeClass + "TRAIN.csv"
twtTSVtrainCovClassPathTrain = wd + ud + "cov-train.tsv"
twtTSVtrainFakeClassPathTrain = wd + ud + "fake-train.tsv"
twtTSVtrainCovClassPathEval = wd + ud + "cov-eval.tsv"
twtTSVtrainFakeClassPathEval = wd + ud + "fake-eval.tsv"
seed = 12355
# Model paths
modCovClassPath = wd + "models/CovClass/"
modFakeClassPath = wd + "models/FakeClass/"
model_name = "bvrau/covid-twitter-bert-v2-struth"
model_fake_name = 'bvrau/covid-twitter-bert-v2-struth'
# More models for fake detection:
# https://huggingface.co/justinqbui/bertweet-covid-vaccine-tweets-finetuned
tokenizer = AutoTokenizer.from_pretrained(model_name)
max_length = 64 # max token sentence length
#%%
# Create training and testing dataset
dfTest = pd.read_csv(twtCSVPath, dtype=(object), delimiter=";")
#dfTest = dfTest[:-900] # remove last 800 rows
#dfTest = dfTest.iloc[:,:-3] # remove last 800 rows
dfTest['text'] = dfTest['rawContent'].apply(CleanTweets.preprocess_roberta)
dfTest.drop(columns=['rawContent'], inplace=True)
# Only keep tweets that are longer than 3 words
dfTest['tweet_proc_length'] = [len(text.split(' ')) for text in dfTest['text']]
dfTest['tweet_proc_length'].value_counts()
dfTest = dfTest[dfTest['tweet_proc_length']>3]
dfTest = dfTest.drop_duplicates(subset=['text'])
dfTest = dfTest.drop(columns=['date', 'Unnamed: 0'])
# Create datasets for each classification
dfCovClass = dfTest
dfFakeClass = dfTest
dfCovClass = dfCovClass.drop(columns=['fake']) # fake column not neeeded in covid topic classification data
dfFakeClass = dfFakeClass[dfFakeClass['topicCovid']=='True'].drop(columns=['topicCovid']) # topicCovid column not neeeded in covid topic classification data
#type_map = {'Covid tweet': 'covid tweets', 'Noncovid tweet': 'noncovid tweet'}
dfCovClass.rename(index = str, columns={'topicCovid': 'labels', 'tid': 'id'}, inplace = True)
dfCovClass.labels = dfCovClass.labels.replace({"True": 'Covid', "False": 'NonCovid'})
#type_map = {'fake news tweet': 'fake news tweet', 'non-fake-news-tweet': 'non-fake-news-tweet'}
dfFakeClass.rename(index = str, columns={'fake': 'labels', 'tid': 'id'}, inplace = True)
dfFakeClass.labels = dfFakeClass.labels.replace({"True": 'Fake', "False": 'True'})
#%%
# Tokenize tweets
dfCovClass = dfCovClass[dfCovClass['labels'].notna()]
dfFakeClass = dfFakeClass[dfFakeClass['labels'].notna()]
dfCovClass['input_ids'] = dfCovClass['text'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
dfFakeClass['input_ids'] = dfFakeClass['text'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
def encode_labels(label):
    """Encode a topic/fake-news string label as a binary integer.

    'Covid' and 'Fake' count as the positive class (1); 'NonCovid' and
    'True' as the negative class (0); anything else defaults to 0.
    """
    return 1 if label in ('Covid', 'Fake') else 0
# Encode the string labels to integers for model training.
dfCovClass['labels_encoded'] = dfCovClass['labels'].apply(encode_labels)
dfFakeClass['labels_encoded'] = dfFakeClass['labels'].apply(encode_labels)
# get n of classes (unique tweet ids per encoded class)
print("# of Non-Covid tweets (coded 0):")
print(dfCovClass.groupby('labels_encoded', group_keys=False)['id'].nunique())
# 62 non-covid tweets, disproportionate sample for training has to be 124 tweets
print("# of Fake-news tweets (coded 1):")
print(dfFakeClass.groupby('labels_encoded', group_keys=False)['id'].nunique())
# create disproportionate sample - 50/50 of both
#dfCovClass.groupby('labels_encoded', group_keys=False)['id'].nunique()
#dfCovClass = dfCovClass.groupby('labels_encoded', group_keys=False).apply(lambda x: x.sample(164, random_state=seed))
# after a lot of tests, it seems that a sample in which non-fake news tweets are overrepresented leads to better results.
# because of this, performance limitations and time constraints, group 1 (covid topic) will be overrepresented (twice as many), which still doesn't reflect the real preoportions ~10/1
'''dfCovClassa = dfCovClass.groupby('labels_encoded', group_keys=False).get_group(1).sample(frac=1, replace=True).reset_index()
dfCovClassb = dfCovClass.groupby('labels_encoded', group_keys=False).get_group(0).sample(frac=1, replace=True).reset_index()
dfCovClassab= pd.concat([dfCovClassa,dfCovClassb])
dfCovClassab.reset_index(inplace=True)
dfCovClass_train, dfCovClass_test = train_test_split(dfCovClassab, test_size=0.1, random_state=seed, stratify=dfCovClassab['labels_encoded'])
'''
# create training and validation samples (90/10, stratified so class
# proportions are preserved in both splits). `train_test_split` and `seed`
# come from earlier in the file — TODO confirm (presumably sklearn).
dfCovClass_train, dfCovClass_test = train_test_split(dfCovClass, test_size=0.1, random_state=seed, stratify=dfCovClass['labels_encoded'])
# reset index and drop unnecessary columns
dfCovClass_train.reset_index(drop=True, inplace=True)
dfCovClass_train.drop(inplace=True, columns=['tweet_proc_length'])
dfCovClass_train.groupby('labels_encoded', group_keys=False)['id'].nunique()
dfCovClass_test.reset_index(drop=True, inplace=True)
dfCovClass_test.drop(inplace=True, columns=['tweet_proc_length'])
dfCovClass_test.groupby('labels_encoded', group_keys=False)['id'].nunique()
# save dfs as csvs and tsvs, for training and validation
# covid classification datafiles
# rows 0-41 = noncovid, 42-81 covid, therfore:
#dfCovClass = dfCovClass.drop(columns=['tweet_proc_length'])
#dfCovClass.reset_index(inplace=True, drop=True)
#dfCovClass.loc[np.r_[0:31, 42:71], :].reset_index(drop=True).to_csv(twtCSVtrainCovClassPathTrain, encoding='utf-8', sep=";")
#dfCovClass.loc[np.r_[0:31, 42:72], :].reset_index(drop=True).to_csv(twtTSVtrainCovClassPathTrain, encoding='utf-8', sep="\t")
#dfCovClass.loc[np.r_[31:41, 72:81], :].reset_index(drop=True).to_csv(twtCSVtrainCovClassPath, encoding='utf-8', sep=";")
#dfCovClass.loc[np.r_[31:41, 72:81], :].reset_index(drop=True).to_csv(twtTSVtrainCovClassPathEval, encoding='utf-8', sep="\t")
# fake news classification datafiles
#dfFakeClass = dfFakeClass.drop(columns=['tweet_proc_length'])
#dfFakeClass[200:1000].reset_index(drop=True).to_csv(twtCSVtrainFakeClassPathTrain, encoding='utf-8', sep=";")
#dfFakeClass[200:1000].reset_index(drop=True).to_csv(twtTSVtrainFakeClassPathTrain, encoding='utf-8', sep="\t")
#dfFakeClass[0:199].reset_index(drop=True).to_csv(twtCSVtrainFakeClassPath, encoding='utf-8', sep=";")
#dfFakeClass[0:199].reset_index(drop=True).to_csv(twtTSVtrainFakeClassPathEval, encoding='utf-8', sep="\t")
#%%
# Prepare trainer
# NOTE(review): the HF Trainer API below was abandoned in favor of the manual
# training loop further down; kept commented for reference.
#from transformers import TrainingArguments
#training_args = TrainingArguments(
#    report_to = 'wandb',
#    output_dir=wd+'results',  # output directory/
#    overwrite_output_dir = True,
#    num_train_epochs=6,  # total number of training epochs
#    per_device_train_batch_size=8,  # batch size per device during training
#    per_device_eval_batch_size=16,  # batch size for evaluation
#    learning_rate=2e-5,
#    warmup_steps=1000,  # number of warmup steps for learning rate scheduler
#    weight_decay=0.01,  # strength of weight decay
#    logging_dir='./logs3',  # directory for storing logs
#    logging_steps=1000,
#    evaluation_strategy="epoch",
#    save_strategy="epoch",
#    load_best_model_at_end=True
#)
# Load the tokenizer matching the pretrained checkpoint (`model_name` is
# defined earlier in the file — TODO confirm).
tokenizer = AutoTokenizer.from_pretrained(model_name)
from transformers import BertForSequenceClassification, AdamW#, BertConfig
#from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
"""
train_dataset = load_dataset('csv', data_files={'train': twtCSVtrainCovClassPathTrain}, encoding = "utf-8")
train_dataset = train_dataset['train']
eval_dataset = load_dataset('csv', data_files={'test': twtCSVtrainCovClassPath}, encoding = "utf-8")
eval_dataset = eval_dataset['test']
"""
# Batch size of 1 — presumably forced by memory constraints on the training
# machine; confirm before reuse, larger batches train far faster.
batch_size = 1
from torch.utils.data import Dataset
class PandasDataset(Dataset):
    """Torch ``Dataset`` that tokenizes rows of a pandas DataFrame lazily.

    The frame must provide a 'text' column (raw tweet text) and a
    'labels_encoded' column (integer class id). Tokenization happens on
    every ``__getitem__`` call, padded/truncated to ``max_length``.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        record = self.dataframe.iloc[index]
        encoding = self.tokenizer(
            record['text'],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )
        # Tensors in the shape the BERT forward pass expects; DataLoader
        # adds the batch dimension.
        return {
            'input_ids': torch.tensor(encoding['input_ids']),
            'attention_mask': torch.tensor(encoding['attention_mask']),
            'labels': torch.tensor(record['labels_encoded']),
        }
# Wrap the train/test frames in torch Datasets and DataLoaders. Training data
# is shuffled (RandomSampler); validation is read in order (SequentialSampler).
train_dataset = PandasDataset(dfCovClass_train, tokenizer, max_length)
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size
)
eval_dataset = PandasDataset(dfCovClass_test, tokenizer, max_length)
validation_dataloader = DataLoader(
    eval_dataset,
    sampler=SequentialSampler(eval_dataset),
    batch_size=batch_size
)
# Sanity check: inspect the first batch, then stop.
for idx, batch in enumerate(train_dataloader):
    print('Batch index: ', idx)
    print('Batch size: ', batch['input_ids'].size())  # Access 'input_ids' field
    print('Batch label: ', batch['labels'])  # Access 'labels' field
    break
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels = 2,  # The number of output labels--2 for binary classification.
                     # You can increase this for multi-class tasks.
    output_attentions = False,  # Whether the model returns attentions weights.
    output_hidden_states = False,  # Whether the model returns all hidden-states.
)
#trainer = Trainer(
#    model=model,  # the instantiated 🤗 Transformers model to be trained
#    args=training_args,  # training arguments, defined above
#    train_dataset=train_dataset,  # training dataset
#    eval_dataset=eval_dataset  # evaluation dataset
#)
# Note: AdamW is a class from the huggingface library (as opposed to pytorch)
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8  # args.adam_epsilon - default is 1e-8.
                  )
from transformers import get_linear_schedule_with_warmup
# Number of training epochs. The BERT authors recommend between 2 and 4.
# We chose to run for 6
epochs = 6
# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs
# Create the learning rate scheduler (linear decay, no warmup).
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,  # Default value in run_glue.py
                                            num_training_steps = total_steps)
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows in `preds` whose argmax equals `labels`.

    `preds` is a 2-D array of per-class scores (one row per example);
    `labels` is the matching array of integer class ids.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    return (predicted == expected).mean()
import time
import datetime
def format_time(elapsed):
    """Format a duration given in seconds as an 'h:mm:ss' string.

    The input is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(elapsed))
    return str(datetime.timedelta(seconds=whole_seconds))
import random
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
# Set the seed value all over the place to make this reproducible.
seed_val = 12355
# If there's a GPU available...
if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    #model.cuda()
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
# NOTE(review): this unconditional override forces the CPU even when the GPU
# branch above selected "cuda" (and it prints the misleading "We will use the
# GPU" message). Presumably deliberate (model.cuda() is also commented out),
# but confirm — remove this line to actually train on the GPU.
device = torch.device("cpu")
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
# For each epoch...
for epoch_i in range(0, epochs):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('{:>5,} steps per batch will be calculated.'.format(len(train_dataloader)))
    print('Training...')
    # Measure how long the training epoch takes.
    t0 = time.time()
    model.to(device)
    # Reset the total loss for this epoch.
    total_train_loss = 0
    # Put the model into training mode. Don't be mislead--the call to
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()
    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):
        # Progress update every 10 batches.
        if step % 10 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
        # Unpack this training batch from our dataloader.
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        # NOTE(review): debug print runs on every batch — noisy; consider removing.
        print("Batch keys:", batch.keys())
        b_input_ids = batch['input_ids'].to(device)
        b_input_mask = batch['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)
        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because
        # accumulating the gradients is "convenient while training RNNs".
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()
        # Perform a forward pass (evaluate the model on this training batch).
        # The documentation for this `model` function is here:
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        # It returns different numbers of parameters depending on what arguments
        # arge given and what flags are set. For our useage here, it returns
        # the loss (because we provided labels) and the "logits"--the model
        # outputs prior to activation.
        output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = output[0]
        logits = output[1]
        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value
        # from the tensor.
        total_train_loss += loss.item()
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()
        # Update the learning rate.
        scheduler.step()
    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    print("")
    print("Running Validation...")
    t0 = time.time()
    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()
    # Tracking variables
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0  # NOTE(review): never incremented or read — dead variable.
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        # Unpack this training batch from our dataloader.
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using
        # the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch['input_ids'].to(device)
        b_input_mask = batch['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            # token_type_ids is the same as the "segment ids", which
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here:
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            # Get the "logits" output by the model. The "logits" are the output
            # values prior to applying an activation function like the softmax.
            output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss = output[0]
            logits = output[1]
        # Accumulate the validation loss.
        total_eval_loss += loss.item()
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)
    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))
    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
# Print a summary of the model's named parameters, grouped by rough layer
# position (first 5 = embeddings, next 16 = first transformer block,
# last 4 = classification head).
params = list(model.named_parameters())
print('The BERT model has {:} different named parameters.\n'.format(len(params)))
print('==== Embedding Layer ====\n')
for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== First Transformer ====\n')
for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== Output Layer ====\n')
for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
import os
# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
from datetime import datetime as dt
# Timestamped output directory so successive runs never overwrite each other.
fTimeFormat = "%Y-%m-%d_%H-%M-%S"
now = dt.now().strftime(fTimeFormat)
# `modCovClassPath` is defined earlier in the file — TODO confirm.
output_dir = modCovClassPath + now + "/"
# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
print("Saving model to %s" % output_dir)
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
# Good practice: save your training arguments together with the trained model
# torch.save(args, os.path.join(output_dir, 'training_args.bin'))
import pandas as pd
# Display floats with two decimal places.
pd.set_option('display.precision', 2)
# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)
# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')
# A hack to force the column headers to wrap.
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])
# Display the table.
df_stats
# Persist the per-epoch training statistics alongside the saved model.
df_stats.to_csv(output_dir + now + ".csv")