29 Commits

Author SHA1 Message Date
89b4755c65 adds link to full package to readme 2023-08-31 01:23:38 +02:00
01e58b1b99 adds html files to gitignore 2023-08-31 01:21:31 +02:00
d0fcefedf4 data/OUT/profiles/CovTweets.html gelöscht 2023-08-31 01:20:39 +02:00
71cf907249 data/OUT/profiles/AllTweets.html gelöscht 2023-08-31 01:20:31 +02:00
a9018fedee REALLY corrects the filetree 2023-08-30 21:54:13 +02:00
d94a93295f corrects filetree 2023-08-30 21:53:05 +02:00
80b63b39df adds readme 2023-08-30 21:45:38 +02:00
d8136909c8 corrects import of own functions that didn't work anymore because of a newer python version. 2023-08-30 21:45:27 +02:00
1c6d9d5415 cleans and renames files 2023-08-30 21:18:55 +02:00
4e08cde317 finishes classification scripts 2023-08-16 10:06:16 +02:00
2535683cdc finishes classification scripts 2023-08-15 14:51:28 +02:00
8f744a08be adds final counter keywords 2023-08-15 14:30:40 +02:00
df5fd51a5f repairs stupid 2023-08-15 14:30:13 +02:00
3d4f559d2d adds model training stats 2023-08-15 14:29:42 +02:00
2e067b6a64 adds both classification scripts. Corrects inclusion of CleanTweets functions. 2023-08-15 14:23:56 +02:00
7a16526a97 adds dataset profiles 2023-08-15 14:20:13 +02:00
b89b5969ec adds typerror controls 2023-08-15 14:19:33 +02:00
7c6b618272 adds both training scripts and evaluation files of topic classification 2023-08-15 14:19:08 +02:00
90aa58239c adds generation of model-training dataset 2023-08-14 15:37:30 +02:00
1beff96ae9 adds model training code 2023-08-14 15:37:05 +02:00
881d3d6d6d adds tweet-text-cleaning functions 2023-08-14 15:36:46 +02:00
5a63c478e9 adds dataset profiler 2023-08-08 15:32:12 +02:00
ed61d52182 adds files to gitignore 2023-08-08 00:07:42 +02:00
a26d150060 renames pretest classification file 2023-08-08 00:06:18 +02:00
d791e4a293 adds classification file. adds removal of empty tweets after transormation for classification preparation 2023-08-08 00:04:14 +02:00
d57b7a31b7 adds more counter keywords 2023-08-08 00:03:30 +02:00
13d80124d3 adds lines with counterKeywords to remove non-covid tweets 2023-08-07 23:45:11 +02:00
3de6d8f3ec adds tweetLen column, converts keywords to lowercase and removes certain keywords 2023-08-07 23:07:29 +02:00
899a99ba72 adds CleanTweets functions, creates Graphs 2023-07-07 18:18:51 +02:00
23 changed files with 2131 additions and 92 deletions

2
.gitignore vendored
View File

@@ -2,6 +2,8 @@
**/*lock* **/*lock*
**/*-slice*.csv **/*-slice*.csv
**/*.zip **/*.zip
**/*.html
**/*.htm
/ALL-SENATORS-LONG.csv /ALL-SENATORS-LONG.csv
/ALL-SENATORS.csv /ALL-SENATORS.csv
/collect2.py /collect2.py

1
.vscode/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
/settings.json

123
ClassificationFake.py Normal file
View File

@@ -0,0 +1,123 @@
import numpy as np
import pandas as pd
from datetime import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset
#%%
# Configuration cell: working directory, file names and the absolute paths
# derived from them by plain string concatenation.
# install xformers (pip install xformers) for better performance
###################
# Setup directories
# WD Michael (must end with "/" because paths are built with "+")
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server (disabled alternative)
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory, relative to wd
di = "data/IN/"
# Tweet-datafile output directory, relative to wd
ud = "data/OUT/"
# Input file of this script: it is read into dfClassify at the end of this block
senCSV = "Tweets-Classified-Topic-Results.csv"
# Output files: cleaned text handed to the classifier, and the final results
senCSVClassifiedPrep = "Tweets-Classified-Fake-Prep.csv"
senCSVClassifiedResult = "Tweets-Classified-Fake-Results.csv"
# don't change this one (absolute paths = wd + output directory + file name)
senCSVPath = wd + ud + senCSV
senCSVcClassificationPrepPath = wd + ud + senCSVClassifiedPrep
senCSVcClassificationResultPath = wd + ud + senCSVClassifiedResult
# Put <wd>/funs on the module search path so CleanTweets can be imported
# as a top-level module.
import sys
funs = wd+"funs"
sys.path.insert(1, funs)
import CleanTweets
#%%
# get dataframe; dtype=object keeps every column as read, without type inference
dfClassify = pd.read_csv(senCSVPath, dtype=(object))
def encode_labels(label):
    """Invert a topic label: 'True' becomes 'False' and 'False' becomes 'True'.

    Labels are compared as strings. Any other value yields the integer 0.
    """
    for current, inverted in (('True', 'False'), ('False', 'True')):
        if label == current:
            return inverted
    return 0
# Invert the stored topic labels and write the corrected table back to the
# file it was read from.
# NOTE(review): the inverted labels overwrite the input file, so running this
# cell a second time inverts them again - confirm before re-running.
dfClassify['output_label_topicCov'] = dfClassify['output_label_topicCov'].apply(encode_labels)
# Use the configured path instead of a second hard-coded copy of it, so that
# switching `wd` (e.g. to the server directory) keeps read and write in sync.
dfClassify.to_csv(senCSVPath, encoding='utf-8')
# Keep only tweets labelled as covid-related. The explicit copy makes the
# column assignments that follow act on an independent frame rather than on a
# slice of the full table.
dfClassify = dfClassify[dfClassify['output_label_topicCov']=='True'].copy()
# placeholder column, initialised to False for every remaining tweet
dfClassify['fake'] = False
#%%
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
# HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
# Directory of the fine-tuned fake-news classifier, derived from `wd` so the
# location is defined once and follows a change of working directory.
modelDir = wd + "models/FakeClass/2023-08-15_14-35-43/"
pipe = pipeline("text-classification", model=modelDir)
model = AutoModelForSequenceClassification.from_pretrained(modelDir)
tokenizer = AutoTokenizer.from_pretrained(modelDir)
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
# Clean every raw tweet with the shared preprocessing pipeline
dfClassify['cleanContent'] = dfClassify['rawContent'].apply(CleanTweets.preprocess_text)
#%%
# remove empty rows: tweets whose cleaned text is empty become NaN and are dropped.
# The column is reassigned explicitly; an in-place replace on a column accessor
# is not guaranteed to write back into the frame.
dfClassify['cleanContent'] = dfClassify['cleanContent'].replace('', np.nan)
dfClassify = dfClassify.dropna(subset=['cleanContent'])
#%%
timeStart = datetime.now() # start counting execution time
# Token ids per tweet, padded to max_length. No truncation argument is passed
# here, so longer texts are not cut at this point.
# NOTE(review): the pipeline call further down is fed the cleanContent text,
# not this column - confirm whether input_ids is still needed.
max_length = 128
dfClassify['input_ids'] = dfClassify['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
# leftovers from the tutorial this cell was adapted from:
#train.rename(columns={'target': 'labels'}, inplace=True)
#train.head()
# %%
# Write only the id and cleaned text to the prep file ...
dfClassify.to_csv(senCSVcClassificationPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
#%%
# ... and load that file again as a datasets "csv" dataset for the pipeline
dataset = load_dataset("csv", data_files=senCSVcClassificationPrepPath)
# %%
# Disabled alternative: run the pipeline with a tqdm progress bar
#from tqdm.auto import tqdm
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
# print(out)
#%%
# Run the classifier over the cleaned tweets in batches of 8 and collect one
# label and one score per tweet, in dataset order.
output_labels = []
output_score = []
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
    # each result looks like {'label': 'POSITIVE', 'score': 0.9998743534088135}
    output_labels.append(out['label'])
    output_score.append(out['score'])
# %%
# Attach the predictions; the lists must have one entry per row of dfClassify
dfClassify['output_label_fake'] = output_labels
dfClassify['output_score_fake'] = output_score
timeEnd = datetime.now()
timeTotal = timeEnd - timeStart
# Average over the tweets that were actually classified (previously a fixed
# divisor of 96); guard against an empty run.
nClassified = len(output_labels)
timePerTweet = timeTotal / nClassified if nClassified else timeTotal
# timeTotal is a timedelta, so convert it before labelling the value as seconds
print(f"Total classification execution time: {timeTotal.total_seconds()} seconds")
print(f"Time per tweet classification: {timePerTweet}")
# %%
# Persist the full table including the new prediction columns
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
# %%

123
ClassificationTopic.py Normal file
View File

@@ -0,0 +1,123 @@
import numpy as np
import pandas as pd
from datetime import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset
#%%
# Configuration cell: working directory, file names and the absolute paths
# derived from them by plain string concatenation.
# install xformers (pip install xformers) for better performance
###################
# Setup directories
# WD Michael (must end with "/" because paths are built with "+")
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server (disabled alternative)
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory, relative to wd
di = "data/IN/"
# Tweet-datafile output directory, relative to wd
ud = "data/OUT/"
# Input file of this script: it is read into dfClassify at the end of this block
senCSV = "SenatorsTweets-OnlyCov.csv"
# Output files: cleaned text handed to the classifier, and the final results
senCSVClassifiedPrep = "Tweets-Classified-Topic-Prep.csv"
senCSVClassifiedResult = "Tweets-Classified-Topic-Results.csv"
# don't change this one (absolute paths = wd + output directory + file name)
senCSVPath = wd + ud + senCSV
senCSVcClassificationPrepPath = wd + ud + senCSVClassifiedPrep
senCSVcClassificationResultPath = wd + ud + senCSVClassifiedResult
# Put <wd>/funs on the module search path so CleanTweets can be imported
# as a top-level module.
import sys
funs = wd+"funs"
sys.path.insert(1, funs)
import CleanTweets
#%%
# get dataframe; dtype=object keeps every column as read, without type inference
dfClassify = pd.read_csv(senCSVPath, dtype=(object))
# placeholder column, initialised to False for every tweet
dfClassify['fake'] = False
#%%
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
# HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
# Directory of the fine-tuned covid-topic classifier, derived from `wd` so the
# location is defined once and follows a change of working directory.
modelDir = wd + "models/CovClass/2023-08-15_05-56-50/"
pipe = pipeline("text-classification", model=modelDir)
model = AutoModelForSequenceClassification.from_pretrained(modelDir)
tokenizer = AutoTokenizer.from_pretrained(modelDir)
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
# Clean every raw tweet with the shared preprocessing pipeline
dfClassify['cleanContent'] = dfClassify['rawContent'].apply(CleanTweets.preprocess_text)
#%%
# remove empty rows: tweets whose cleaned text is empty become NaN and are dropped.
# The column is reassigned explicitly; an in-place replace on a column accessor
# is not guaranteed to write back into the frame.
dfClassify['cleanContent'] = dfClassify['cleanContent'].replace('', np.nan)
dfClassify = dfClassify.dropna(subset=['cleanContent'])
#%%
timeStart = datetime.now() # start counting execution time
# Token ids per tweet, padded to max_length. No truncation argument is passed
# here, so longer texts are not cut at this point.
# NOTE(review): the pipeline call further down is fed the cleanContent text,
# not this column - confirm whether input_ids is still needed.
max_length = 128
dfClassify['input_ids'] = dfClassify['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
# leftovers from the tutorial this cell was adapted from:
#train.rename(columns={'target': 'labels'}, inplace=True)
#train.head()
# %%
# Write only the id and cleaned text to the prep file ...
dfClassify.to_csv(senCSVcClassificationPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
#%%
# ... and load that file again as a datasets "csv" dataset for the pipeline
dataset = load_dataset("csv", data_files=senCSVcClassificationPrepPath)
# %%
# Disabled alternative: run the pipeline with a tqdm progress bar
#from tqdm.auto import tqdm
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
# print(out)
#%%
# Run the classifier over the cleaned tweets in batches of 8 and collect one
# label and one score per tweet, in dataset order.
output_labels = []
output_score = []
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
    # each result looks like {'label': 'POSITIVE', 'score': 0.9998743534088135}
    output_labels.append(out['label'])
    output_score.append(out['score'])
# %%
# Attach the predictions; the lists must have one entry per row of dfClassify
dfClassify['output_label_topicCov'] = output_labels
dfClassify['output_score_topicCov'] = output_score
timeEnd = datetime.now()
timeTotal = timeEnd - timeStart
# Average over the tweets that were actually classified (previously a fixed
# divisor of 96); guard against an empty run.
nClassified = len(output_labels)
timePerTweet = timeTotal / nClassified if nClassified else timeTotal
# timeTotal is a timedelta, so convert it before labelling the value as seconds
print(f"Total classification execution time: {timeTotal.total_seconds()} seconds")
print(f"Time per tweet classification: {timePerTweet}")
# %%
# Persist the full table including the new prediction columns
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
# %%
## corrections
def encode_labels(label):
    """Translate a model label into a string flag.

    'real' becomes 'True' and 'fake' becomes 'False'; any other value yields
    the integer 0.
    """
    for modelLabel, flag in (('real', 'True'), ('fake', 'False')):
        if label == modelLabel:
            return flag
    return 0
# Recode the raw model labels ('real'/'fake') into 'True'/'False' strings and
# overwrite the results file written above with the recoded table.
dfClassify['output_label_topicCov'] = dfClassify['output_label_topicCov'].apply(encode_labels)
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
# The mapping above is still wrong; ClassificationFake.py corrects it when it
# reads this file.

132
README.md
View File

@@ -1,7 +1,131 @@
# How to use # Requirements
Execute collect.py to scrape tweets and generate the ´ALL-SENATORS-TWEETS.csv´. - python 3.10+
- snscrape 0.6.2.20230321+ (see git repo in this folder)
- transformers 4.31.0
- numpy 1.23.5
- pandas 2.0.3
- scikit-learn 1.3.0
- torch 2.0.1
Execute collectSenData.py to scrape senator data and generate ´ALL-SENATORS.csv´. # About
All new files will be written to ´data/OUT/´. Necessary data has to be located in ´data/IN/´ This collection of scripts scrapes tweets of US-senators in the time from 2020-01-01T00:00:00Z to 2023-01-03T00:00:00Z, scrapes account data of the senators, prepares the tweets for the training of a NLP-model, trains two models to (1) classify the tweets topic as covid or non-covid and (2) the tweets as either "fake news" tweets or "non-fake news" tweets.
Training only works with a prepared dataset in which the tweets are pre classified.
More info in the comments of the scripts.
Due to time constraints, most of the code is procedurally coded and ugly but effective.
# How to
Tested on Ubuntu 22.04.
If needed, the virtual environment can be exported and sent to you.
All files in the folder data/IN have to exist in order to execute the scripts.
Execute in the following order:
01 collect.py (see more for further info on scraping)
02 collectSenData.py
03 cleanTweets.py
04 preTestClassification.py
05 trainTopic.py
06 trainFake.py
07 ClassificationTopic.py
08 ClassificationFake.py (reads Tweets-Classified-Topic-Results.csv, so it has to run after ClassificationTopic.py)
# Files & Folders
Datafiles are not included in the repository but can be found in the full package that can be downloaded from [here](https://ncloud.mischbeck.de/s/T4QcMDSfYSkadYC) (password protected).
```
├── data
│   ├── IN
│   │   ├── counterKeywordsFinal.txt
│   │   ├── counterKeywords.txt
│   │   ├── keywords-raw.txt
│   │   ├── keywords.txt
│   │   ├── own_keywords.txt
│   │   ├── pretest-tweets_fake.txt contains tweet ids for pretest
│   │   ├── pretest-tweets_not_fake.txt contains tweet ids for pretest
│   │   └── senators-raw.csv senator datafile
│   ├── OUT
│   │   ├── ALL-SENATORS-TWEETS.csv
│   │   ├── graphs
│   │   │   ├── Timeline.png
│   │   │   ├── Wordcloud-All.png
│   │   │   └── Wordcloud-Cov.png
│   │   ├── Pretest-Prep.csv
│   │   ├── Pretest-Results.csv
│   │   ├── Pretest-SENATORS-TWEETS.csv
│   │   ├── profiles dataset profiles
│   │   │   ├── AllTweets.html
│   │   │   └── CovTweets.html
│   │   ├── SenatorsTweets-Final.csv
│   │   ├── SenatorsTweets-OnlyCov.csv
│   │   ├── SenatorsTweets-train-CovClassification.csv
│   │   ├── SenatorsTweets-train-CovClassificationTRAIN.csv
│   │   ├── SenatorsTweets-train-CovClassification.tsv
│   │   ├── SenatorsTweets-train-FakeClassification.csv
│   │   ├── SenatorsTweets-train-FakeClassificationTRAIN.csv
│   │   ├── SenatorsTweets-train-FakeClassification.tsv
│   │   ├── SenatorsTweets-Training.csv
│   │   ├── SenatorsTweets-Training_WORKING-COPY.csv
│   │   ├── topClass-PRETEST-Prep.csv
│   │   ├── topClass-PRETEST-Results.csv
│   │   ├── Tweets-All-slices.zip
│   │   ├── Tweets-Classified-Fake-Prep.csv
│   │   ├── Tweets-Classified-Fake-Results.csv
│   │   ├── Tweets-Classified-Prep.csv
│   │   ├── Tweets-Classified-Topic-Prep.csv
│   │   ├── Tweets-Classified-Topic-Results.csv
│   │   └── Tweets-Stub.csv
├── funs
│   ├── CleanTweets.py multiple functions to clean tweet contents for NLP-processing
│   ├── ClearDupes.py function for deletion of duplicate keywords
│   ├── __init__.py
│   ├── Scrape.py scraper functions to be used for multiprocessing
│   └── TimeSlice.py time slice script to slice the time span in 24 slices, speeds up scraping through multiprocessing
├── log logs of the scraping process
│   ├── log_2023-06-23_21-06-10_err.log
│   ├── log_2023-06-23_21-06-10.log
│   └── log_2023-06-23_21-06-10_missing.log
├── models
│   ├── CovClass Covid tweet classification model
│   │   └── 2023-08-15_05-56-50
│   │   ├── 2023-08-15_05-56-50.csv training output
│   │   ├── config.json
│   │   ├── pytorch_model.bin
│   │   ├── special_tokens_map.json
│   │   ├── tokenizer_config.json
│   │   ├── tokenizer.json
│   │   └── vocab.txt
│   └── FakeClass Fake tweet classification model
│   └── 2023-08-15_14-35-43
│   ├── 2023-08-15_14-35-43.csv training output
│   ├── config.json
│   ├── pytorch_model.bin
│   ├── special_tokens_map.json
│   ├── tokenizer_config.json
│   ├── tokenizer.json
│   └── vocab.txt
├── snscrape contains snscrape 0.6.2.20230321+ git repo
├── ClassificationFake.py classifies tweets as fake or non-fake, saves:
│ Tweets-Classified-Fake-Prep.csv - cleaned tweet texts prepared as classifier input
│ Tweets-Classified-Fake-Results.csv - Tweets-Classified-Topic-Results.csv (covid tweets only) with fake classification results
├── ClassificationTopic.py classifies tweet topic, saves:
│ Tweets-Classified-Topic-Prep.csv - cleaned tweet texts prepared as classifier input
│ Tweets-Classified-Topic-Results.csv - SenatorsTweets-OnlyCov.csv with cov classification results
├── cleanTweets.py Curates keywordlists
│ Merges senator and tweet datasets
│ Creates multiple datasets:
│ SenatorsTweets-Final.csv - all tweets with keyword columns
│ SenatorsTweets-OnlyCov.csv - only covid tweets, filtered by keywordlist
│ SenatorsTweets-Training.csv - training dataset, containing ~1800 randomly selected tweets from SenatorsTweets-OnlyCov.csv
├── collect.py scrapes tweets, saves to ALL-SENATORS-TWEETS.csv
├── collectSenData.py scrapes senator account data, saves to ALL-SENATORS.csv
├── createGraphs.py creates wordcloud & timeline graphs
├── preTestClassification.py pretest script that uses bvrau/covid-twitter-bert-v2-struth to analyze 100 preclassified tweets
├── profiler.py creates dataset profiles
├── README.md readme
├── trainFake.py training script for the fake tweet classification model
└── trainTopic.py training script for the tweet topic classification model
```

View File

@@ -8,10 +8,13 @@ Created on Mon Jun 26 20:36:43 2023
import pandas as pd import pandas as pd
# import pyreadstat # import pyreadstat
# import numpy as np import numpy as np
from funs.ClearDupes import deDupe import sys
# Seed for training dataset generation
seed = 86431891
################### ###################
# Setup directories # Setup directories
# WD Michael # WD Michael
@@ -32,17 +35,26 @@ senCSV = "ALL-SENATORS-TWEETS.csv"
senDataset = "senators-raw.csv" senDataset = "senators-raw.csv"
# Name of new datafile generated # Name of new datafile generated
senCSVc = "Tweets-Cleaned" senCSVc = "SenatorsTweets-Final"
senCSVcCov = "SenatorsTweets-OnlyCov"
senCSVcTrain = "SenatorsTweets-Training"
# don't change this one # don't change this one
senCSVPath = wd + ud + senCSV senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSV + ".csv" senCSVcPath = wd + ud + senCSVc + ".csv"
senCSVcCovPath = wd + ud + senCSVcCov + ".csv"
senCSVcTrainPath = wd + ud + senCSVcTrain + ".csv"
senSAVcPath = wd + ud + senCSV + ".sav" senSAVcPath = wd + ud + senCSV + ".sav"
senDTAcPath = wd + ud + senCSV + ".dta" senDTAcPath = wd + ud + senCSV + ".dta"
senDatasetPath = wd + di + senDataset senDatasetPath = wd + di + senDataset
df = pd.read_csv(senCSVPath, dtype=(object)) df = pd.read_csv(senCSVPath, dtype=(object))
## Import own functions
funs = wd+"funs"
sys.path.insert(1, funs)
from ClearDupes import deDupe
mixed_columns = df.columns[df.nunique() != len(df)] mixed_columns = df.columns[df.nunique() != len(df)]
print(mixed_columns) print(mixed_columns)
@@ -83,28 +95,51 @@ with open(f"{di}keywords-raw.txt", "r") as file:
for line in lines: for line in lines:
keyword = line.strip() # Remove the newline character keyword = line.strip() # Remove the newline character
keywords.append(keyword) keywords.append(keyword)
# delete keywords ppe and china that lead to too many false positives
removeWords = {'ppe', 'china'}
keywords = [x.lower() for x in keywords] # converts to lowercase which makes the search case insensitive. convert to set to speed up comparison
keywords = [item for item in keywords if item not in removeWords ] # removes words
with open(f"{di}keywords.txt", "w") as file: with open(f"{di}keywords.txt", "w") as file:
print("read keyword files") print("read keyword files")
for line in keywords: for line in keywords:
file.write(f'{line}\n') file.write(f'{line}\n')
# counter keywords
# Read the keywords from a file
counterKeywords = []
with open(f"{di}counterKeywords.txt", "r") as file:
lines = file.readlines()
for line in lines:
counterKeyword = line.strip() # Remove the newline character
counterKeywords.append(counterKeyword)
counterKeywords = set([x.lower() for x in counterKeywords]) # converts to lowercase which makes the search case insensitive. convert to set to speed up comparison
with open(f"{di}counterKeywordsFinal.txt", "w") as file:
print("read keyword files")
for line in counterKeywords:
file.write(f'{line}\n')
#%% #%%
# overwrite keyword column # overwrite keyword column
df['contains_keyword'] = '' df['keywords'] = np.nan
df['contains_keyword'] = ( df['keywords'] = (
df['rawContent'].str.findall('|'.join(keywords)).str.join(',').replace('', 'none') df['rawContent'].str.lower().str.findall('|'.join(keywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive
)
df['counterKeywords'] = np.nan
df['counterKeywords'] = (
df['rawContent'].str.lower().str.findall('|'.join(counterKeywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive
) )
mask = (df['contains_keyword'] != 'none') # select all values in contains_keyword == 'none'
df.loc[mask,'keywords'] = df['contains_keyword'] # set keywords = contains_keyword under the condition of mask
#%% #%%
# create bool contains_keyword # create boolean contains_keyword column
df['contains_keyword'] = ~pd.isnull(df['keywords']) # create boolean column df['contains_keyword'] = True
#%% df['contains_counterKeyword'] = True
# recode contains keyword to bool mask = (df['keywords'].isna()) # select all values in contains_keyword == 'none'
mask = (df['contains_keyword'] != 'none') df.loc[mask,'contains_keyword'] = False # set keywords = contains_keyword under the condition of mask
df.loc[mask,'contains_keyword'] = True mask = (df['counterKeywords'].isna()) # select all values in contains_keyword == 'none'
df.loc[~mask,'contains_keyword'] = False # ~ negates mask, selecting all values that do not contain keywords df.loc[mask,'contains_counterKeyword'] = False # set keywords = contains_keyword under the condition of mask
#%%
pd.Series(df["user.id"]).is_unique pd.Series(df["user.id"]).is_unique
#%% #%%
@@ -157,17 +192,42 @@ dfAll = df.merge(dfSenAll, how='left',on='user.username')
unique_usernames = dfAll.loc[dfAll['name'].isnull(), 'user.username'].unique() unique_usernames = dfAll.loc[dfAll['name'].isnull(), 'user.username'].unique()
print(unique_usernames) print(unique_usernames)
# senatorisakson was dropped, is ok # senatorisakson was dropped, is ok
#%%
# create covidtweets csv
dfCov = dfAll[dfAll['contains_counterKeyword']==False]
dfCov = dfCov[dfCov['contains_keyword']==True]
dfCov = dfCov.drop(columns=['contains_counterKeyword', 'counterKeywords'])
#%%
# create column with tweet length
dfCov['tweetLen'] = dfCov['rawContent'].str.len().copy()
# reset df index and write to id column
dfCov.reset_index(drop=True, inplace=True)
#%% #%%
# Export to csv, sav and dta # Export to csv, sav and dta
dfAll.to_csv(senCSVcPath, encoding='utf-8') dfAll.to_csv(senCSVcPath, encoding='utf-8')
dfCov.to_csv(senCSVcCovPath, encoding='utf-8', index_label = 'id')
# pyreadstat.write_sav(df, senSAVcPath) # commented out because file generated is 11 gb # pyreadstat.write_sav(df, senSAVcPath) # commented out because file generated is 11 gb
# ============================================================================= # =============================================================================
# dfAll.rename(columns=lambda x: x.replace('.', '_'), inplace=True) # dfAll.rename(columns=lambda x: x.replace('.', '_'), inplace=True)
# dfAllStata = dfAll.rename(columns={'class':'class_'}) # dfAllStata = dfAll.rename(columns={'class':'class_'})
# dfAllStata.to_stata(senDTAcPath, version=119, convert_strl=['alt'], convert_dates={'date': 'td', 'user_created': 'td'}) # dfAllStata.to_stata(senDTAcPath, version=119, convert_strl=['alt'], convert_dates={'date': 'td', 'user_created': 'td'})
# print(dfAllStata.columns) # print(dfAllStata.columns)
# ============================================================================= # ====================================================df.id.str.len().value_counts()
# =========================
# %% # %%
# Create training dataset
np.random.seed(seed);
dfTrain = pd.dfCov(np.random.rand(1800))
# %%
# Create training dataset
np.random.seed(seed);
dfTrain = dfCov.loc[np.random.choice(dfCov.index, 1800, replace=False)]
dfTrain = dfTrain[['tid', 'date', 'rawContent']]
dfTrain['topicCovid'] = True
dfTrain['fake'] = False
dfTrain.to_csv(senCSVcTrainPath, encoding='utf-8')

View File

@@ -66,7 +66,6 @@ which is the final output.
import os import os
import pandas as pd import pandas as pd
import glob import glob
import time
import sys import sys
from datetime import datetime from datetime import datetime
import concurrent.futures import concurrent.futures
@@ -149,10 +148,12 @@ tweetDFColumns = [
################## do NOT change anything below this line ################### ################## do NOT change anything below this line ###################
############################################################################# #############################################################################
## Import functions ## Import own functions
from funs.TimeSlice import * funs = wd+"funs"
from funs.ClearDupes import deDupe sys.path.insert(1, funs)
from funs.Scrape import scrapeTweets from TimeSlice import get_Tslices
from ClearDupes import deDupe
from Scrape import scrapeTweets
################### ###################
# Create logfile & log all outputs # Create logfile & log all outputs

144
createGraphs.py Normal file
View File

@@ -0,0 +1,144 @@
#%%
#!/usr/bin/env python3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
import string
#%%
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 26 20:36:43 2023
@author: michael
"""
import pandas as pd
# import pyreadstat
# import numpy as np
###################
# Setup directories
# WD Michael (must end with "/" because paths are built with "+")
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server (disabled alternative)
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory, relative to wd
di = "data/IN/"
# Tweet-datafile output directory, relative to wd
ud = "data/OUT/"
# NOTE(review): senCSV and senDataset are not referenced again in this script
# as far as visible here - presumably leftovers from cleanTweets.py; confirm.
senCSV = "SenatorsTweets-OnlyCov.csv" # SenatorsTweets-Final.csv SenatorsTweets-OnlyCov.csv
# Name of the senator datafile
senDataset = "senators-raw.csv"
# Input files: all tweets, and the covid-only subset
senCSVc = "SenatorsTweets-Final.csv"
senCSVcCov = "SenatorsTweets-OnlyCov.csv"
# Outfiles (relative to the output directory)
wcAllTweetsF = "graphs/Wordcloud-All.png"
wcCovTweetsF = "graphs/Wordcloud-Cov.png"
TwCovTimeline = "graphs/Timeline.png"
# don't change this one (absolute paths = wd + output directory + file name)
senCSVcPath = wd + ud + senCSVc
senCSVcCovPath = wd + ud + senCSVcCov
wcAllTweetsFPath = wd + ud + wcAllTweetsF
wcCovTweetsFPath = wd + ud + wcCovTweetsF
TwCovTimelinePath = wd + ud + TwCovTimeline
#%%
# Load both datasets; dtype=object keeps every column as read
df = pd.read_csv(senCSVcPath, dtype=(object))
dfCov = pd.read_csv(senCSVcCovPath, dtype=(object))
#%%
# Cleaning steps applied, in this order, to every raw tweet before plotting
tweetCleaners = (remove_URL, remove_emoji, remove_html, remove_punct)

# all tweets: clean the raw text step by step
cleanedAll = df['rawContent']
for cleanStep in tweetCleaners:
    cleanedAll = cleanedAll.apply(cleanStep)
df['cleanContent'] = cleanedAll
# create string with all cleaned tweets as text
str_alltweets = df['cleanContent'].astype(str).str.cat(sep=' ').casefold()
#%%
# covid tweets: same cleaning steps
cleanedCov = dfCov['rawContent']
for cleanStep in tweetCleaners:
    cleanedCov = cleanedCov.apply(cleanStep)
dfCov['cleanContent'] = cleanedCov
# create string with all cleaned tweets as text
str_covtweets = dfCov['cleanContent'].astype(str).str.cat(sep=' ').casefold()
#%%
# replace single U and S characters (stand-alone letters surrounded by spaces)
for loneLetter in (' u ', ' s '):
    str_covtweets = str_covtweets.replace(loneLetter, ' ')
    str_alltweets = str_alltweets.replace(loneLetter, ' ')
# %%
# create wordcloud alltweets from the merged text of all cleaned tweets
wcA = WordCloud(background_color="white", width=1000, height=1000, repeat=True)
wcA.generate(str_alltweets)
#%%
# draw the word cloud on a new figure without axes
plt.figure( figsize=(20,20))
plt.axis("off")
plt.imshow(wcA, interpolation="bilinear")
# keep a handle to the current figure before show(), then save via that handle
fig1 = plt.gcf()
plt.show()
fig1.savefig(wcAllTweetsFPath)
# %%
# create wordcloud covtweets from the merged text of the covid-only tweets
wcC = WordCloud(background_color="white", width=1000, height=1000, repeat=True)
wcC.generate(str_covtweets)
#%%
# draw the word cloud on a new figure without axes
plt.figure( figsize=(20,20))
plt.axis("off")
plt.imshow(wcC, interpolation="bilinear")
# keep a handle to the current figure before show(), then save via that handle
fig2 = plt.gcf()
plt.show()
fig2.savefig(wcCovTweetsFPath)
# %%
# Disabled debugging aid: dump the merged covid text to a file
# with open('test.txt', 'w') as f:
# f.write(str_covtweets)
# %%
# Timeline: number of tweets per day, for all tweets and for covid tweets.
# Start with one row per tweet and a constant helper column to count.
dfT = pd.DataFrame()
dfT['date'] = df['date'].copy()
dfT['count'] = 1
dfCovT = pd.DataFrame()
dfCovT['date'] = dfCov['date'].copy()
dfCovT['count'] = 1
#%%
# Reduce every timestamp to its calendar day, then convert back to datetime.
# Plain date strings would be plotted as categories: the x-axis would follow
# the order of first appearance and the date locators below would not operate
# on real dates.
dfT['date'] = pd.to_datetime(pd.to_datetime(dfT['date']).dt.strftime('%Y-%m-%d'))
dfCovT['date'] = pd.to_datetime(pd.to_datetime(dfCovT['date']).dt.strftime('%Y-%m-%d'))
#%%
# tweets per day
dfT = dfT.groupby('date').count().reset_index()
dfCovT = dfCovT.groupby('date').count().reset_index()
#%%
import matplotlib.dates as mdates
# n of tweets overall
# NOTE(review): my_dpi was only used for an extra, never-drawn figure that has
# been dropped here; it is not applied to the saved plot - confirm whether
# savefig should use it.
my_dpi=300
# Newer Matplotlib versions ship the seaborn styles under a "seaborn-v0_8-"
# prefix; fall back to the old name where only that one exists.
darkgridStyle = 'seaborn-v0_8-darkgrid'
if darkgridStyle not in plt.style.available:
    darkgridStyle = 'seaborn-darkgrid'
plt.style.use(darkgridStyle)
fig, ax = plt.subplots(figsize=(8, 6))
# covid tweets (faded) and all tweets (solid) share one colour
ax.plot(dfCovT['date'], dfCovT['count'], marker='', color='tab:blue', linewidth=1, alpha=0.4)
ax.plot(dfT['date'], dfT['count'], marker='', color='tab:blue', linewidth=1, alpha=1)
# major tick every third month, minor tick every month
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
ax.xaxis.set_minor_locator(mdates.MonthLocator())
fig.autofmt_xdate()
fig.savefig(TwCovTimelinePath)
# %%

View File

@@ -0,0 +1,23 @@
opioid
gun violence
gun-violence
CHD
Coronary heart disease
addiction
tobacco
vaping
e-cigarette
shooting
indigenous women
overdose
meth
cocaine
separated children
separating children
separating families
Muslim travel ban
flu-season
flu season
Soleimani
Muslim Ban
USMCA trade deal

View File

@@ -0,0 +1,23 @@
meth
gun violence
flu season
vaping
chd
addiction
indigenous women
separating children
tobacco
e-cigarette
muslim ban
soleimani
cocaine
separating families
muslim travel ban
usmca trade deal
shooting
overdose
separated children
coronary heart disease
gun-violence
opioid
flu-season

View File

@@ -18,44 +18,43 @@ socialdistancing
wear a mask wear a mask
lockdown lockdown
covd covd
Coronavirus coronavirus
Koronavirus koronavirus
Corona corona
CDC cdc
Wuhancoronavirus wuhancoronavirus
Wuhanlockdown wuhanlockdown
Ncov ncov
Wuhan wuhan
N95 n95
Kungflu kungflu
Epidemic epidemic
outbreak outbreak
Sinophobia sinophobia
China
covid-19 covid-19
corona virus corona virus
covid covid
covid19 covid19
sars-cov-2 sars-cov-2
COVIDー19 covidー19
COVD covd
pandemic pandemic
coronapocalypse coronapocalypse
canceleverything canceleverything
Coronials coronials
SocialDistancingNow socialdistancingnow
Social Distancing social distancing
SocialDistancing socialdistancing
panicbuy panicbuy
panic buy panic buy
panicbuying panicbuying
panic buying panic buying
14DayQuarantine 14dayquarantine
DuringMy14DayQuarantine duringmy14dayquarantine
panic shop panic shop
panic shopping panic shopping
panicshop panicshop
InMyQuarantineSurvivalKit inmyquarantinesurvivalkit
panic-buy panic-buy
panic-shop panic-shop
coronakindness coronakindness
@@ -65,7 +64,7 @@ chinesevirus
stayhomechallenge stayhomechallenge
stay home challenge stay home challenge
sflockdown sflockdown
DontBeASpreader dontbeaspreader
lockdown lockdown
lock down lock down
shelteringinplace shelteringinplace
@@ -79,13 +78,13 @@ flatten the curve
china virus china virus
chinavirus chinavirus
quarentinelife quarentinelife
PPEshortage ppeshortage
saferathome saferathome
stayathome stayathome
stay at home stay at home
stay home stay home
stayhome stayhome
GetMePPE getmeppe
covidiot covidiot
epitwitter epitwitter
pandemie pandemie
@@ -93,7 +92,7 @@ wear a mask
wearamask wearamask
kung flu kung flu
covididiot covididiot
COVID__19 covid__19
omicron omicron
variant variant
vaccine vaccine
@@ -139,9 +138,7 @@ work from home
workfromhome workfromhome
working from home working from home
workingfromhome workingfromhome
ppe
n95 n95
ppe
n95 n95
covidiots covidiots
covidiots covidiots

8
data/OUT/.gitignore vendored Normal file
View File

@@ -0,0 +1,8 @@
/ALL-SENATORS-TWEETS.csv
/Pretest-Prep.csv
/Pretest-Results.csv
/Pretest-SENATORS-TWEETS.csv
/SenatorsTweets-Final.csv
/SenatorsTweets-OnlyCov.csv
/Tweets-Classified-Prep.csv
/Tweets-Stub.csv

3
data/OUT/graphs/.gitignore vendored Normal file
View File

@@ -0,0 +1,3 @@
/Timeline.png
/Wordcloud-All.png
/Wordcloud-Cov.png

89
funs/CleanTweets.py Normal file
View File

@@ -0,0 +1,89 @@
import re
import string
def preprocess_roberta(text): # https://huggingface.co/cardiffnlp/twitter-roberta-base-sep2022
    """Mask user mentions and links in a tweet.

    The text is split on whitespace. Every token longer than one character
    that starts with ``@`` and contains exactly one ``@`` becomes ``@user``;
    every such token starting with ``http`` becomes ``http``. All other
    tokens are kept as they are. The tokens are re-joined with single spaces.
    """
    def _mask(token):
        # One-character tokens are passed through untouched.
        if len(token) <= 1:
            return token
        if token[0] == '@' and token.count('@') == 1:
            token = '@user'
        if token.startswith('http'):
            token = 'http'
        return token

    return ' '.join(map(_mask, text.split()))
def remove_URL(text):
    """Return ``text`` with web addresses removed.

    Removes everything from ``http://`` / ``https://`` or ``www.`` up to the
    next whitespace character. Surrounding whitespace is left untouched, so
    a removed link in the middle of a sentence leaves two adjacent spaces.
    """
    # The pattern is a constant, so compiling it cannot fail on the input;
    # no exception handling is needed around it.
    return re.sub(r'https?://\S+|www\.\S+', '', text)
def remove_emoji(text):
    """Return ``text`` without characters from the emoji/symbol ranges below.

    Runs of matching characters are deleted; nothing is inserted in their
    place.
    """
    pictographs = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        # NOTE: the next range runs from U+24C2 up to U+1F251 and therefore
        # also covers every code point in between (e.g. CJK ideographs),
        # not only emoji.
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE)
    return re.sub(pictographs, '', text)
def remove_html(text):
    """Return ``text`` without HTML tags and character entities.

    A tag is the shortest ``<...>`` span; an entity is ``&name;``,
    ``&#123;`` or ``&#x1f;`` (lower-case hex digits only).
    """
    markup = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});'
    return re.compile(markup).sub('', text)
def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Mapping a code point to None tells str.translate to drop it.
    drop_punctuation = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_punctuation)
def remove_nonascii(text):
    """Return ``text`` with all characters outside the 7-bit ASCII range deleted."""
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    return non_ascii_run.sub('', text)
def remove_spec(text):
    """Turn a few HTML entities back into readable text.

    ``&amp;`` (with or without the trailing semicolon) becomes ``and``,
    ``&lt;`` becomes ``<`` and ``&gt;`` becomes ``>``. The substitutions are
    applied in that order.
    """
    substitutions = (
        (r'&amp;?', r'and'),
        (r'&lt;', r'<'),
        (r'&gt;', r'>'),
    )
    for entity, plain in substitutions:
        text = re.sub(entity, plain, text)
    return text
def remove_spaces(text): # also new line chars and to lower case
    """Normalise whitespace and case.

    Replaces ``&lt;`` with ``<``, joins all lines with a single space,
    lower-cases the result, strips leading/trailing whitespace and finally
    collapses every run of two or more whitespace characters into one space.
    """
    unescaped = re.sub(r'&lt;', r'<', text)
    single_line = " ".join(unescaped.splitlines()).lower().strip()
    return re.sub(r'\s{2,}', ' ', single_line)
def remove_retw(text):
    """Return ``text`` without retweet markers and user mentions.

    First removes ``RT @name`` / ``rt @name`` (optional spaces around the
    ``@``), then every remaining ``@name`` token. The marker must start at a
    word boundary so that words merely ending in "rt" (e.g. "support @x")
    are not truncated.
    """
    text = re.sub(r'\b(?:RT|rt)[ ]*@[ ]*\S+', '', text)
    return re.sub(r'@\S+', '', text)


def preprocess_text(text):
    """Run the full tweet-cleaning pipeline on a single string.

    Order matters:
      * URLs are removed first, while their punctuation is still intact.
      * Retweet markers and mentions are removed before punctuation is
        stripped, because ``remove_punct`` deletes the ``@`` those patterns
        rely on.
      * Whitespace normalisation (which also lower-cases) runs last so the
        gaps left by the other steps are collapsed.

    Returns the cleaned, lower-cased string.
    """
    text = remove_URL(text)
    text = remove_retw(text)
    text = remove_emoji(text)
    text = remove_html(text)
    text = remove_punct(text)
    text = remove_nonascii(text)
    # NOTE(review): at this point remove_html has already deleted
    # ';'-terminated entities and remove_punct has deleted every '&', so
    # remove_spec finds nothing to replace here. Kept for parity with the
    # existing pipeline; confirm whether "&amp;" -> "and" is wanted.
    text = remove_spec(text)
    text = remove_spaces(text)
    return text


def preprocess_text_series(series):
    """Apply :func:`preprocess_text` to every element of a pandas Series.

    Returns a new Series; the steps and their order are exactly those of
    ``preprocess_text`` so both entry points always agree.
    """
    return series.apply(preprocess_text)
# Smoke test, executed once whenever this module is imported: the sample
# below contains a URL, emoji, an HTML tag, a retweet marker, user mentions
# and heavy punctuation, and is pushed through the complete pipeline.
input_text = """
Check out this amazing website: https://www.example.com! 😃
<html>This is an HTML tag.</html>
RT @user123: Just received a package from @companyXYZ. It's awesome! 📦
This is a test text with lots of punctuations!!! Can't wait to see more...
"""
processed_text = preprocess_text(input_text)
# Uncomment to inspect the cleaned sample:
# print(processed_text)

View File

@@ -0,0 +1,7 @@
epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
1,0.39025546515679493,0.40877932761593355,0.9103260869565217,0:10:21,0:00:40
2,0.3057803610952067,0.3502063500978377,0.9103260869565217,0:10:53,0:00:43
3,0.17910970049364833,0.27903796154904464,0.9375,0:10:30,0:00:38
4,0.09279396105943587,0.41342766528301267,0.904891304347826,0:11:03,0:00:43
5,0.06132459050129317,0.4468563502887264,0.9239130434782609,0:12:07,0:00:44
6,0.04195396880810895,0.4350045176675928,0.9266304347826086,0:11:21,0:00:40
1 epoch Training Loss Valid. Loss Valid. Accur. Training Time Validation Time
2 1 0.39025546515679493 0.40877932761593355 0.9103260869565217 0:10:21 0:00:40
3 2 0.3057803610952067 0.3502063500978377 0.9103260869565217 0:10:53 0:00:43
4 3 0.17910970049364833 0.27903796154904464 0.9375 0:10:30 0:00:38
5 4 0.09279396105943587 0.41342766528301267 0.904891304347826 0:11:03 0:00:43
6 5 0.06132459050129317 0.4468563502887264 0.9239130434782609 0:12:07 0:00:44
7 6 0.04195396880810895 0.4350045176675928 0.9266304347826086 0:11:21 0:00:40

View File

@@ -0,0 +1,7 @@
epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
1,0.6699380816093513,0.6216431430407933,0.6964285714285714,0:01:03,0:00:02
2,0.6649796058024678,0.621175297669002,0.6964285714285714,0:01:03,0:00:01
3,0.642247314964022,0.6377243144171578,0.6964285714285714,0:01:05,0:00:02
4,0.6300328698541436,0.6038827853543418,0.6964285714285714,0:01:04,0:00:02
5,0.544977219509227,0.6619421115943364,0.625,0:01:02,0:00:02
6,0.3951783587357828,0.48477122613361906,0.7857142857142857,0:01:05,0:00:01
1 epoch Training Loss Valid. Loss Valid. Accur. Training Time Validation Time
2 1 0.6699380816093513 0.6216431430407933 0.6964285714285714 0:01:03 0:00:02
3 2 0.6649796058024678 0.621175297669002 0.6964285714285714 0:01:03 0:00:01
4 3 0.642247314964022 0.6377243144171578 0.6964285714285714 0:01:05 0:00:02
5 4 0.6300328698541436 0.6038827853543418 0.6964285714285714 0:01:04 0:00:02
6 5 0.544977219509227 0.6619421115943364 0.625 0:01:02 0:00:02
7 6 0.3951783587357828 0.48477122613361906 0.7857142857142857 0:01:05 0:00:01

View File

@@ -0,0 +1,7 @@
epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
1,0.5610552686641376,0.4569096086310089,0.9116022099447514,0:37:20,0:00:31
2,0.43647773836513126,0.5441495520680196,0.9005524861878453,0:36:14,0:00:30
3,0.288773139899344,0.43471020716692715,0.9392265193370166,0:36:10,0:00:29
4,0.19330878817686287,0.4555162174395349,0.9281767955801105,0:36:17,0:00:30
5,0.09109889855869348,0.5060150003684702,0.9281767955801105,0:36:13,0:00:30
6,0.05734757932275739,0.6043995772428771,0.9226519337016574,0:36:11,0:00:31
1 epoch Training Loss Valid. Loss Valid. Accur. Training Time Validation Time
2 1 0.5610552686641376 0.4569096086310089 0.9116022099447514 0:37:20 0:00:31
3 2 0.43647773836513126 0.5441495520680196 0.9005524861878453 0:36:14 0:00:30
4 3 0.288773139899344 0.43471020716692715 0.9392265193370166 0:36:10 0:00:29
5 4 0.19330878817686287 0.4555162174395349 0.9281767955801105 0:36:17 0:00:30
6 5 0.09109889855869348 0.5060150003684702 0.9281767955801105 0:36:13 0:00:30
7 6 0.05734757932275739 0.6043995772428771 0.9226519337016574 0:36:11 0:00:31

View File

@@ -0,0 +1,7 @@
epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
1,0.21681843259712502,0.0005426188472483773,1.0,0:01:13,0:00:02
2,0.00016121647037353423,0.0002873415878639207,1.0,0:01:12,0:00:02
3,6.752021149355535e-05,0.00024319994372490328,1.0,0:01:12,0:00:02
4,4.7950222591787355e-05,0.00022139604243420763,1.0,0:01:13,0:00:02
5,3.99839740138679e-05,0.00021302999493855168,1.0,0:01:11,0:00:02
6,3.5356899656214995e-05,0.00020912183117616223,1.0,0:01:13,0:00:02
1 epoch Training Loss Valid. Loss Valid. Accur. Training Time Validation Time
2 1 0.21681843259712502 0.0005426188472483773 1.0 0:01:13 0:00:02
3 2 0.00016121647037353423 0.0002873415878639207 1.0 0:01:12 0:00:02
4 3 6.752021149355535e-05 0.00024319994372490328 1.0 0:01:12 0:00:02
5 4 4.7950222591787355e-05 0.00022139604243420763 1.0 0:01:13 0:00:02
6 5 3.99839740138679e-05 0.00021302999493855168 1.0 0:01:11 0:00:02
7 6 3.5356899656214995e-05 0.00020912183117616223 1.0 0:01:13 0:00:02

View File

@@ -1,12 +1,9 @@
import re
import string
import numpy as np
import pandas as pd import pandas as pd
from datetime import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset from transformers.pipelines.pt_utils import KeyDataset
#%% #%%
# prepare # prepare
# install xformers (pip install xformers) for better performance # install xformers (pip install xformers) for better performance
@@ -38,7 +35,6 @@ senCSVPretest = "Pretest.csv"
senCSVPretestPrep = "Pretest-Prep.csv" senCSVPretestPrep = "Pretest-Prep.csv"
senCSVPretestResult = "Pretest-Results.csv" senCSVPretestResult = "Pretest-Results.csv"
# don't change this one # don't change this one
senCSVPath = wd + ud + senCSV senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSVc senCSVcPath = wd + ud + senCSVc
@@ -48,6 +44,11 @@ senCSVcPretestResultPath = wd + ud + senCSVPretestResult
preTestIDsFakePath = wd + di + preTestIDsFake preTestIDsFakePath = wd + di + preTestIDsFake
preTestIDsNotPath = wd + di + preTestIDsNot preTestIDsNotPath = wd + di + preTestIDsNot
import sys
funs = wd+"funs"
sys.path.insert(1, funs)
import CleanTweets
# List of IDs to select # List of IDs to select
# Read the IDs from a file # Read the IDs from a file
preTestIDsFakeL = [] preTestIDsFakeL = []
@@ -82,41 +83,12 @@ model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-
tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth") tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert # Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
def remove_URL(text):
url = re.compile(r'https?://\S+|www\.\S+')
return url.sub(r'', text)
dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(CleanTweets.preprocess_text)
def remove_emoji(text):
emoji_pattern = re.compile(
'['
u'\U0001F600-\U0001F64F' # emoticons
u'\U0001F300-\U0001F5FF' # symbols & pictographs
u'\U0001F680-\U0001F6FF' # transport & map symbols
u'\U0001F1E0-\U0001F1FF' # flags (iOS)
u'\U00002702-\U000027B0'
u'\U000024C2-\U0001F251'
']+',
flags=re.UNICODE)
return emoji_pattern.sub(r'', text)
def remove_html(text):
html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
return re.sub(html, '', text)
def remove_punct(text):
table = str.maketrans('', '', string.punctuation)
return text.translate(table)
dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(remove_URL)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_emoji)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_html)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_punct)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(lambda x: x.lower())
#%% #%%
timeStart = datetime.now() # start counting execution time
max_length = 128 max_length = 128
dfPreTest['input_ids'] = dfPreTest['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids']) dfPreTest['input_ids'] = dfPreTest['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
#train.rename(columns={'target': 'labels'}, inplace=True) #train.rename(columns={'target': 'labels'}, inplace=True)
@@ -149,6 +121,14 @@ for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, trun
dfPreTest['output_label'] = output_labels dfPreTest['output_label'] = output_labels
dfPreTest['output_score'] = output_score dfPreTest['output_score'] = output_score
timeEnd = datetime.now()
timeTotal = timeEnd - timeStart
timePerTweet = timeTotal / 96
print(f"Total classification execution time: {timeTotal} seconds")
print(f"Time per tweet classification: {timePerTweet}")
print(f"Estimated time for full classification of tweets: {timePerTweet*50183}")
# %% # %%
dfPreTest.to_csv(senCSVcPretestResultPath, encoding='utf-8') dfPreTest.to_csv(senCSVcPretestResultPath, encoding='utf-8')

55
profiler.py Normal file
View File

@@ -0,0 +1,55 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 8 14:49:02 2023
@author: michael
"""
import pandas as pd
import pandas_profiling as pp
import numpy
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'

# datafile input directory
di = "data/IN/"

# Tweet-datafile output directory
ud = "data/OUT/"

# File containing all collected senator tweets (read below)
senCSV = "ALL-SENATORS-TWEETS.csv"

# Senators dataset file (its path is built below but not read in this script)
senDataset = "senators-raw.csv"

# Base names (without extension) of the generated datafiles
senCSVc = "SenatorsTweets-Final"
senCSVcCov = "SenatorsTweets-OnlyCov"

# don't change this one
senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSVc + ".csv"
senCSVcCovPath = wd + ud + senCSVcCov + ".csv"
# NOTE(review): the next two paths append to senCSV, which already ends in
# ".csv" (giving "...csv.sav" / "...csv.dta"). They are unused in this
# script; confirm whether senCSVc was intended.
senSAVcPath = wd + ud + senCSV + ".sav"
senDTAcPath = wd + ud + senCSV + ".dta"
senDatasetPath = wd + di + senDataset

# Directory for the generated HTML reports. Built from wd like every other
# path in this script, so the reports land next to the data regardless of
# the directory the script is started from.
profilesPath = wd + ud + "profiles/"

# Profile of the complete tweet dataset (all columns read as strings)
df = pd.read_csv(senCSVPath, dtype=(object))
profileAll = pp.ProfileReport(df, minimal=True)
profileAll.to_file(profilesPath + "AllTweets.html")

# Profile of the covid-only subset
df = pd.read_csv(senCSVcCovPath, dtype=(object))
profileAll = pp.ProfileReport(df, minimal=True)
profileAll.to_file(profilesPath + "CovTweets.html")

35
repairmystupidity.py Normal file
View File

@@ -0,0 +1,35 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 14 20:47:22 2023
@author: michael
"""
import pandas as pd
wd = "/home/michael/Documents/PS/Data/collectTweets/"

# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"

# "falsch" = wrong: the hand-edited working copy whose rawContent/date
# columns are replaced below.
falsch = wd + ud + "SenatorsTweets-Training_WORKING-COPY-correct.csv"
# "richtig" = right: the file the rawContent/date values are taken from.
richtig = wd + ud + "SenatorsTweets-Training.csv"
# Destination of the repaired file.
correct = wd + ud + "SenatorsTweets-Training_WORKING-COPY-correct2.csv"

# Name of new datafile generated
senCSVprep = "SenatorsTweets-Training_WORKING-COPY-prepared"

# don't change this one
falsch = pd.read_csv(falsch, dtype=(object), sep=";")
richtig = pd.read_csv(richtig, dtype=(object))

# Left-join on the tweet id, throw away the working copy's own
# rawContent/date (suffix _x) and keep the ones from the joined file (_y).
df = (
    pd.merge(falsch, richtig[['tid', 'rawContent', 'date']], on='tid', how='left')
    .drop(columns=['rawContent_x', 'date_x'])
    .rename(columns={'tid_y': 'tid', 'rawContent_y': 'rawContent', 'date_y': 'date'})
)

# Fix the column order and give the unnamed seventh column a proper name.
df = (
    df[['tid', 'date', 'topicCovid', 'fake', 'rawContent', 'Unnamed: 6']]
    .rename(columns={'Unnamed: 6': 'comment'})
)

df.to_csv(correct, encoding='utf-8', sep=";")

613
trainFake.py Normal file
View File

@@ -0,0 +1,613 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Aug 12 12:25:18 2023
@author: michael
"""
#from datasets import load_dataset
#from transformers import Trainer
#from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import torch
import numpy as np
from sklearn.model_selection import train_test_split # pip install scikit-learn
import pandas as pd
## Uses snippets from this guide:
# https://mccormickml.com/2019/07/22/BERT-fine-tuning/
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
import sys
funs = wd+"funs"
sys.path.insert(1, funs)
import CleanTweets
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
# Training CSV dataset
twtCSV = "SenatorsTweets-Training_WORKING-COPY-correct2"
twtCSVtrainCovClass = "SenatorsTweets-train-CovClassification"
twtCSVtrainFakeClass = "SenatorsTweets-train-FakeClassification"
statsTrainingTopicClass = "statsTopicClassification-"
# don't change this one
twtCSVPath = wd + ud + twtCSV + ".csv"
twtCSVtrainCovClassPath = wd + ud + twtCSVtrainCovClass + ".csv"
twtCSVtrainFakeClassPath = wd + ud + twtCSVtrainFakeClass + ".csv"
statsTrainingTopicClassPath = wd + ud + statsTrainingTopicClass
twtCSVtrainCovClassPathTrain = wd + ud + twtCSVtrainCovClass + "TRAIN.csv"
twtCSVtrainFakeClassPathTrain = wd + ud + twtCSVtrainFakeClass + "TRAIN.csv"
twtTSVtrainCovClassPathTrain = wd + ud + "cov-train.tsv"
twtTSVtrainFakeClassPathTrain = wd + ud + "fake-train.tsv"
twtTSVtrainCovClassPathEval = wd + ud + "cov-eval.tsv"
twtTSVtrainFakeClassPathEval = wd + ud + "fake-eval.tsv"
seed = 12355
# Model paths
modCovClassPath = wd + "models/CovClass/"
modFakeClassPath = wd + "models/FakeClass/"
model_name = 'digitalepidemiologylab/covid-twitter-bert-v2' # accuracy 69
#model_name = 'justinqbui/bertweet-covid19-base-uncased-pretraining-covid-vaccine-tweets' #48
#model_name = "cardiffnlp/tweet-topic-latest-multi"
model_name = "bvrau/covid-twitter-bert-v2-struth"
#model_name = "cardiffnlp/roberta-base-tweet-topic-single-all"
model_fake_name = 'bvrau/covid-twitter-bert-v2-struth'
# More models for fake detection:
# https://huggingface.co/justinqbui/bertweet-covid-vaccine-tweets-finetuned
tokenizer = AutoTokenizer.from_pretrained(model_name)
max_length = 64 # max token sentence length
#%%
# Create training and testing dataset
# All columns are read as strings (dtype=object); the file is ';'-separated.
dfTest = pd.read_csv(twtCSVPath, dtype=(object), delimiter=";")
#dfTest = dfTest[:-900] # remove last 800 rows
#dfTest = dfTest.iloc[:,:-3] # remove last 800 rows
# Mask mentions/links in the raw tweet text, then drop the raw column.
dfTest['text'] = dfTest['rawContent'].apply(CleanTweets.preprocess_roberta)
dfTest.drop(columns=['rawContent'], inplace=True)
# Only keep tweets that are longer than 3 words
# (length = number of pieces when splitting the processed text on single spaces)
dfTest['tweet_proc_length'] = [len(text.split(' ')) for text in dfTest['text']]
# Result is not stored; only useful when the cell is run interactively.
dfTest['tweet_proc_length'].value_counts()
dfTest = dfTest[dfTest['tweet_proc_length']>3]
# Tweets with identical processed text are kept only once.
dfTest = dfTest.drop_duplicates(subset=['text'])
dfTest = dfTest.drop(columns=['date', 'Unnamed: 0'])
# Create datasets for each classification
dfCovClass = dfTest
dfFakeClass = dfTest
dfCovClass = dfCovClass.drop(columns=['fake']) # fake column not needed in covid topic classification data
# Fake-news data: only rows whose topicCovid is the string 'True' are kept.
dfFakeClass = dfFakeClass[dfFakeClass['topicCovid']=='True'].drop(columns=['topicCovid']) # topicCovid column not needed in fake news classification data
#type_map = {'Covid tweet': 'covid tweets', 'Noncovid tweet': 'noncovid tweet'}
# Topic data: topicCovid becomes the label column, tid becomes id; the index is converted to str.
dfCovClass.rename(index = str, columns={'topicCovid': 'labels', 'tid': 'id'}, inplace = True)
dfCovClass.labels = dfCovClass.labels.replace({"True": 'Covid', "False": 'NonCovid'})
#type_map = {'fake news tweet': 'fake news tweet', 'non-fake-news-tweet': 'non-fake-news-tweet'}
# Fake-news data: the fake column becomes the label column, tid becomes id.
dfFakeClass.rename(index = str, columns={'fake': 'labels', 'tid': 'id'}, inplace = True)
#%%
# Tokenize tweets
# Keep only labelled rows. .copy() makes each frame independent of the frame
# it was sliced from, so the column assignments below modify these frames
# themselves rather than a temporary.
dfCovClass = dfCovClass[dfCovClass['labels'].notna()].copy()
dfFakeClass = dfFakeClass[dfFakeClass['labels'].notna()].copy()
# Blank out labels that cannot be used for training. The result is assigned
# back to the column: calling replace(..., inplace=True) on the selected
# column is chained assignment and is not guaranteed to change the frame.
# Rows with a blank label are removed further down by the `labels != ""` filter.
dfFakeClass['labels'] = dfFakeClass['labels'].replace({'Check': '', 'check': '', 'FALSE': ''})
# Pad every tweet to max_length token ids.
dfCovClass['input_ids'] = dfCovClass['text'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
dfFakeClass['input_ids'] = dfFakeClass['text'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
def encode_labels(label):
    """Map a textual label to its integer class.

    ``'Covid'`` and ``'False'`` are encoded as 1. Every other value --
    ``'NonCovid'``, ``'True'`` and anything unrecognised -- is encoded as 0.
    """
    coded_as_one = ('Covid', 'False')
    return 1 if label in coded_as_one else 0
dfCovClass['labels_encoded'] = dfCovClass['labels'].apply(encode_labels)
dfFakeClass['labels_encoded'] = dfFakeClass['labels'].apply(encode_labels)
dfFakeClass = dfFakeClass[dfFakeClass['labels']!=""]
#dfFakeClass = dfFakeClass[(dfFakeClass['labels']=="Fake") | (dfFakeClass['labels']=="True")]
# get n of classes
print("# of Non-Covid tweets (coded 0):")
print(dfCovClass.groupby('labels_encoded', group_keys=False)['id'].nunique())
# 62 non-covid tweets, disproportionate sample for training has to be 124 tweets
print("# of Fake-news tweets (coded 1):")
print(dfFakeClass.groupby('labels_encoded', group_keys=False)['id'].nunique())
# create disproportionate sample - 50/50 of both
#dfCovClass.groupby('labels_encoded', group_keys=False)['id'].nunique()
#dfCovClass = dfCovClass.groupby('labels_encoded', group_keys=False).apply(lambda x: x.sample(164, random_state=seed))
# after a lot of tests, it seems that a sample in which non-fake news tweets are overrepresented leads to better results.
# because of this, performance limitations and time constraints, group 1 (covid topic) will be overrepresented (twice as many), which still doesn't reflect the real preoportions ~10/1
'''dfCovClassa = dfCovClass.groupby('labels_encoded', group_keys=False).get_group(1).sample(frac=1, replace=True).reset_index()
dfCovClassb = dfCovClass.groupby('labels_encoded', group_keys=False).get_group(0).sample(frac=1, replace=True).reset_index()
dfCovClassab= pd.concat([dfCovClassa,dfCovClassb])
dfCovClassab.reset_index(inplace=True)
dfCovClass_train, dfCovClass_test = train_test_split(dfCovClassab, test_size=0.1, random_state=seed, stratify=dfCovClassab['labels_encoded'])
'''
# create training and validation samples
dfFakeClass_train, dfFakeClass_test = train_test_split(dfFakeClass, test_size=0.1, random_state=seed, stratify=dfFakeClass['labels_encoded'])
# reset index and drop unnecessary columns
dfFakeClass_train.reset_index(drop=True, inplace=True)
dfFakeClass_train.drop(inplace=True, columns=['tweet_proc_length'])
dfFakeClass_train.groupby('labels_encoded', group_keys=False)['id'].nunique()
dfFakeClass_test.reset_index(drop=True, inplace=True)
dfFakeClass_test.drop(inplace=True, columns=['tweet_proc_length'])
dfFakeClass_test.groupby('labels_encoded', group_keys=False)['id'].nunique()
# save dfs as csvs and tsvs, for training and validation
# covid classification datafiles
# rows 0-41 = noncovid, 42-81 covid, therfore:
#dfCovClass = dfCovClass.drop(columns=['tweet_proc_length'])
#dfCovClass.reset_index(inplace=True, drop=True)
#dfCovClass.loc[np.r_[0:31, 42:71], :].reset_index(drop=True).to_csv(twtCSVtrainCovClassPathTrain, encoding='utf-8', sep=";")
#dfCovClass.loc[np.r_[0:31, 42:72], :].reset_index(drop=True).to_csv(twtTSVtrainCovClassPathTrain, encoding='utf-8', sep="\t")
#dfCovClass.loc[np.r_[31:41, 72:81], :].reset_index(drop=True).to_csv(twtCSVtrainCovClassPath, encoding='utf-8', sep=";")
#dfCovClass.loc[np.r_[31:41, 72:81], :].reset_index(drop=True).to_csv(twtTSVtrainCovClassPathEval, encoding='utf-8', sep="\t")
# fake news classification datafiles
#dfFakeClass = dfFakeClass.drop(columns=['tweet_proc_length'])
#dfFakeClass[200:1000].reset_index(drop=True).to_csv(twtCSVtrainFakeClassPathTrain, encoding='utf-8', sep=";")
#dfFakeClass[200:1000].reset_index(drop=True).to_csv(twtTSVtrainFakeClassPathTrain, encoding='utf-8', sep="\t")
#dfFakeClass[0:199].reset_index(drop=True).to_csv(twtCSVtrainFakeClassPath, encoding='utf-8', sep=";")
#dfFakeClass[0:199].reset_index(drop=True).to_csv(twtTSVtrainFakeClassPathEval, encoding='utf-8', sep="\t")
#%%
# Prepare trainer
#from transformers import TrainingArguments
#training_args = TrainingArguments(
# report_to = 'wandb',
# output_dir=wd+'results', # output directory/
# overwrite_output_dir = True,
# num_train_epochs=6, # total number of training epochs
# per_device_train_batch_size=8, # batch size per device during training
# per_device_eval_batch_size=16, # batch size for evaluation
# learning_rate=2e-5,
# warmup_steps=1000, # number of warmup steps for learning rate scheduler
# weight_decay=0.01, # strength of weight decay
# logging_dir='./logs3', # directory for storing logs
# logging_steps=1000,
# evaluation_strategy="epoch",
# save_strategy="epoch",
# load_best_model_at_end=True
#)
tokenizer = AutoTokenizer.from_pretrained(model_name)
from transformers import BertForSequenceClassification, AdamW#, BertConfig
#from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
"""
train_dataset = load_dataset('csv', data_files={'train': twtCSVtrainCovClassPathTrain}, encoding = "utf-8")
train_dataset = train_dataset['train']
eval_dataset = load_dataset('csv', data_files={'test': twtCSVtrainCovClassPath}, encoding = "utf-8")
eval_dataset = eval_dataset['test']
"""
batch_size = 1
from torch.utils.data import Dataset
class PandasDataset(Dataset):
    """Dataset that tokenizes the rows of a pandas DataFrame on access.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (the
    row's ``text`` column, tokenized with padding and truncation to
    ``max_length``) and ``labels`` (the row's ``labels_encoded`` column),
    all as torch tensors.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe, self.tokenizer, self.max_length = dataframe, tokenizer, max_length

    def __len__(self):
        # One sample per DataFrame row.
        return len(self.dataframe)

    def __getitem__(self, index):
        # Positional lookup, independent of the DataFrame's index labels.
        record = self.dataframe.iloc[index]
        tweet, target = record['text'], record['labels_encoded']
        tokens = self.tokenizer(
            tweet,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )
        sample = {field: torch.tensor(tokens[field]) for field in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(target)  # labels are expected to be encoded already
        return sample
train_dataset = PandasDataset(dfFakeClass_train, tokenizer, max_length)
train_dataloader = DataLoader(
train_dataset,
sampler=RandomSampler(train_dataset),
batch_size=batch_size
)
eval_dataset = PandasDataset(dfFakeClass_test, tokenizer, max_length)
validation_dataloader = DataLoader(
eval_dataset,
sampler=SequentialSampler(eval_dataset),
batch_size=batch_size
)
for idx, batch in enumerate(train_dataloader):
print('Batch index: ', idx)
print('Batch size: ', batch['input_ids'].size()) # Access 'input_ids' field
print('Batch label: ', batch['labels']) # Access 'labels' field
break
model = BertForSequenceClassification.from_pretrained(
model_name,
num_labels = 2, # The number of output labels--2 for binary classification.
# You can increase this for multi-class tasks.
output_attentions = False, # Whether the model returns attentions weights.
output_hidden_states = False, # Whether the model returns all hidden-states.
)
#trainer = Trainer(
# model=model, # the instantiated 🤗 Transformers model to be trained
# args=training_args, # training arguments, defined above
# train_dataset=train_dataset, # training dataset
# eval_dataset=eval_dataset # evaluation dataset
#)
# Note: AdamW is a class from the huggingface library (as opposed to pytorch)
# I believe the 'W' stands for 'Weight Decay fix"
# NOTE(review): the transformers implementation of AdamW is deprecated
# upstream in favour of torch.optim.AdamW. When switching, check the
# weight_decay default of the replacement against the one used here so the
# training setup stays comparable -- verify before changing.
optimizer = AdamW(model.parameters(),
lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
eps = 1e-8 # args.adam_epsilon - default is 1e-8.
)
from transformers import get_linear_schedule_with_warmup
# Number of training epochs. The BERT authors recommend between 2 and 4.
# We chose to run for 6
epochs = 6
# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
num_warmup_steps = 0, # Default value in run_glue.py
num_training_steps = total_steps)
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the share of rows whose highest-scoring class equals the label.

    ``preds`` holds one row of class scores per sample (argmax is taken
    along axis 1); ``labels`` holds the expected class ids.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    hits = np.sum(predicted == expected)
    return hits / len(expected)
import time
import datetime
def format_time(elapsed):
    '''
    Format a duration given in seconds as a ``datetime.timedelta`` string
    (``h:mm:ss``, with a leading day count for 24 hours and more).
    '''
    # Fractions of a second are rounded to the nearest whole second first.
    whole_seconds = int(round(elapsed))
    return str(datetime.timedelta(seconds=whole_seconds))
import random
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
# Set the seed value all over the place to make this reproducible.
seed_val = 12355
# If there's a GPU available...
if torch.cuda.is_available():
# Tell PyTorch to use the GPU.
device = torch.device("cuda")
print('There are %d GPU(s) available.' % torch.cuda.device_count())
print('We will use the GPU:', torch.cuda.get_device_name(0))
#model.cuda()
# If not...
else:
print('No GPU available, using the CPU instead.')
device = torch.device("cpu")
# NOTE(review): `device` is set to "cpu" a second time on the next line. If
# that assignment sits outside the else-branch it overrides the GPU chosen
# above for every run, and the "We will use the GPU" message is misleading.
# Confirm whether forcing the CPU is intended; if so, say it explicitly.
device = torch.device("cpu")
# Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with the same value.
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
#%%
# Start training

# One dict per epoch with training/validation loss, validation accuracy
# and timings; turned into a DataFrame after training.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# Moving the model is only needed once, not once per epoch.
model.to(device)

# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    # len(train_dataloader) is the number of batches in one epoch.
    print('{:>5,} batches per epoch will be processed.'.format(len(train_dataloader)))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode. Don't be mislead--the call to
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 10 batches.
        if step % 10 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # `batch` is a dict with the keys 'input_ids', 'attention_mask' and
        # 'labels'; copy each tensor to the training device.
        b_input_ids = batch['input_ids'].to(device)
        b_input_mask = batch['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because
        # accumulating the gradients is "convenient while training RNNs".
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()

        # Forward pass. Because labels are provided, the first element of
        # the output is the loss.
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = output[0]

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `.item()` returns the plain
        # Python value of the single-element loss tensor.
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters using the computed gradients, then advance the
        # learning-rate schedule.
        optimizer.step()
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print(" Average training loss: {0:.2f}".format(avg_train_loss))
    print(" Training epcoh took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables
    total_eval_accuracy = 0
    total_eval_loss = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        # Same dict layout as the training batches.
        b_input_ids = batch['input_ids'].to(device)
        b_input_mask = batch['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            # token_type_ids is the same as the "segment ids", which
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

        # The "logits" are the output values prior to applying an activation
        # function like the softmax.
        loss = output[0]
        logits = output[1]

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print(" Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    print(" Validation Loss: {0:.2f}".format(avg_val_loss))
    print(" Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
params = list(model.named_parameters())
print('The BERT model has {:} different named parameters.\n'.format(len(params)))
print('==== Embedding Layer ====\n')
for p in params[0:5]:
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== First Transformer ====\n')
for p in params[5:21]:
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== Output Layer ====\n')
for p in params[-4:]:
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
import os
from datetime import datetime as dt
import pandas as pd

# Persist the fine-tuned model, its tokenizer and the per-epoch statistics in
# a fresh, timestamp-named directory below `modFakeClassPath`.
# Saving best-practices: if you use default names for the model, you can
# reload it using from_pretrained().
fTimeFormat = "%Y-%m-%d_%H-%M-%S"
now = dt.now().strftime(fTimeFormat)
output_dir = modFakeClassPath + now + "/"

# Create the output directory if needed; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)
print("Saving model to %s" % output_dir)

# Save the trained model, configuration and tokenizer using
# `save_pretrained()`. They can then be reloaded using `from_pretrained()`.
# A model wrapped for distributed/parallel training exposes the real model as
# `.module`; otherwise the model itself is saved.
model_to_save = getattr(model, 'module', model)
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
# Good practice: save your training arguments together with the trained model
# torch.save(args, os.path.join(output_dir, 'training_args.bin'))

# Display floats with two decimal places.
pd.set_option('display.precision', 2)
# One row per epoch, indexed by the epoch number.
df_stats = pd.DataFrame(data=training_stats).set_index('epoch')
# Store the statistics next to the model, named after the same timestamp.
df_stats.to_csv(output_dir + now + ".csv")

607
trainTopic.py Normal file
View File

@@ -0,0 +1,607 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Aug 12 12:25:18 2023
@author: michael
"""
#from datasets import load_dataset
#from transformers import Trainer
#from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import torch
import numpy as np
from sklearn.model_selection import train_test_split # pip install scikit-learn
import pandas as pd
## Uses snippets from this guide:
# https://mccormickml.com/2019/07/22/BERT-fine-tuning/

###################
# Directory setup
# Working directory (Michael's machine)
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# Working directory (server)
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'

# Make the project's own helper modules in <wd>/funs importable.
import sys
funs = f"{wd}funs"
sys.path.insert(1, funs)
import CleanTweets

# Input and output data directories, relative to `wd`.
di = "data/IN/"
ud = "data/OUT/"

# Base names (without extension) of the training CSV datasets and stats file.
twtCSV = "SenatorsTweets-Training_WORKING-COPY-correct2"
twtCSVtrainCovClass = "SenatorsTweets-train-CovClassification"
twtCSVtrainFakeClass = "SenatorsTweets-train-FakeClassification"
statsTrainingTopicClass = "statsTopicClassification-"

# Derived paths -- don't change these.
twtCSVPath = f"{wd}{ud}{twtCSV}.csv"
twtCSVtrainCovClassPath = f"{wd}{ud}{twtCSVtrainCovClass}.csv"
twtCSVtrainFakeClassPath = f"{wd}{ud}{twtCSVtrainFakeClass}.csv"
statsTrainingTopicClassPath = f"{wd}{ud}{statsTrainingTopicClass}"
twtCSVtrainCovClassPathTrain = f"{wd}{ud}{twtCSVtrainCovClass}TRAIN.csv"
twtCSVtrainFakeClassPathTrain = f"{wd}{ud}{twtCSVtrainFakeClass}TRAIN.csv"
twtTSVtrainCovClassPathTrain = f"{wd}{ud}cov-train.tsv"
twtTSVtrainFakeClassPathTrain = f"{wd}{ud}fake-train.tsv"
twtTSVtrainCovClassPathEval = f"{wd}{ud}cov-eval.tsv"
twtTSVtrainFakeClassPathEval = f"{wd}{ud}fake-eval.tsv"

# Random seed used for the train/validation split.
seed = 12355

# Directories that receive the trained models.
modCovClassPath = f"{wd}models/CovClass/"
modFakeClassPath = f"{wd}models/FakeClass/"

# Pretrained checkpoints to fine-tune.
model_name = "bvrau/covid-twitter-bert-v2-struth"
model_fake_name = 'bvrau/covid-twitter-bert-v2-struth'
# More models for fake detection:
# https://huggingface.co/justinqbui/bertweet-covid-vaccine-tweets-finetuned

tokenizer = AutoTokenizer.from_pretrained(model_name)
max_length = 64  # max token sentence length
#%%
# Create training and testing dataset
# Every column is read as a string (dtype=object), so the boolean-like
# columns hold the strings "True"/"False".
dfTest = pd.read_csv(twtCSVPath, dtype=(object), delimiter=";")
dfTest['text'] = dfTest['rawContent'].apply(CleanTweets.preprocess_roberta)
dfTest.drop(columns=['rawContent'], inplace=True)

# Only keep tweets that are longer than 3 words
dfTest['tweet_proc_length'] = [len(text.split(' ')) for text in dfTest['text']]
dfTest = dfTest[dfTest['tweet_proc_length'] > 3]
dfTest = dfTest.drop_duplicates(subset=['text'])
dfTest = dfTest.drop(columns=['date', 'Unnamed: 0'])

# Create datasets for each classification.
# Covid-topic data: the 'fake' column is not needed; 'topicCovid' becomes the
# label column. `index=str` converts the row labels to strings.
dfCovClass = (
    dfTest.drop(columns=['fake'])
    .rename(index=str, columns={'topicCovid': 'labels', 'tid': 'id'})
)
dfCovClass['labels'] = dfCovClass['labels'].replace({"True": 'Covid', "False": 'NonCovid'})

# Fake-news data: restricted to covid tweets, so the 'topicCovid' column is
# not needed; 'fake' becomes the label column.
dfFakeClass = (
    dfTest[dfTest['topicCovid'] == 'True']
    .drop(columns=['topicCovid'])
    .rename(index=str, columns={'fake': 'labels', 'tid': 'id'})
)
dfFakeClass['labels'] = dfFakeClass['labels'].replace({"True": 'Fake', "False": 'True'})

#%%
# Tokenize tweets
# Drop unlabelled rows. The explicit .copy() makes each result an independent
# frame, so the column assignments below do not write to a slice of another
# DataFrame (SettingWithCopyWarning).
dfCovClass = dfCovClass[dfCovClass['labels'].notna()].copy()
dfFakeClass = dfFakeClass[dfFakeClass['labels'].notna()].copy()


def _tokenize_ids(text):
    """Return the token ids of `text`, padded and truncated to `max_length`."""
    # truncation=True keeps every id list at exactly max_length entries and
    # matches the tokenizer call used in PandasDataset.
    encoded = tokenizer(text, max_length=max_length, padding="max_length", truncation=True)
    return encoded['input_ids']


dfCovClass['input_ids'] = dfCovClass['text'].apply(_tokenize_ids)
dfFakeClass['input_ids'] = dfFakeClass['text'].apply(_tokenize_ids)
def encode_labels(label):
    """Map a textual class label to its integer class id.

    'Covid' and 'Fake' are the positive classes and yield 1. 'NonCovid',
    'True' and every other value yield 0.
    """
    return 1 if label in ('Covid', 'Fake') else 0
# Add the integer class id next to the textual label in both datasets.
for labelled_frame in (dfCovClass, dfFakeClass):
    labelled_frame['labels_encoded'] = labelled_frame['labels'].apply(encode_labels)

# Number of distinct tweet ids per encoded class, for both datasets.
print("# of Non-Covid tweets (coded 0):")
cov_class_counts = dfCovClass.groupby('labels_encoded', group_keys=False)['id'].nunique()
print(cov_class_counts)
# 62 non-covid tweets, disproportionate sample for training has to be 124 tweets
print("# of Fake-news tweets (coded 1):")
fake_class_counts = dfFakeClass.groupby('labels_encoded', group_keys=False)['id'].nunique()
print(fake_class_counts)

# create disproportionate sample - 50/50 of both
#dfCovClass.groupby('labels_encoded', group_keys=False)['id'].nunique()
#dfCovClass = dfCovClass.groupby('labels_encoded', group_keys=False).apply(lambda x: x.sample(164, random_state=seed))
# After a lot of tests, it seems that a sample in which non-fake news tweets are overrepresented leads to better results.
# Because of this, performance limitations and time constraints, group 1 (covid topic) will be overrepresented (twice as many), which still doesn't reflect the real proportions ~10/1
'''dfCovClassa = dfCovClass.groupby('labels_encoded', group_keys=False).get_group(1).sample(frac=1, replace=True).reset_index()
dfCovClassb = dfCovClass.groupby('labels_encoded', group_keys=False).get_group(0).sample(frac=1, replace=True).reset_index()
dfCovClassab= pd.concat([dfCovClassa,dfCovClassb])
dfCovClassab.reset_index(inplace=True)
dfCovClass_train, dfCovClass_test = train_test_split(dfCovClassab, test_size=0.1, random_state=seed, stratify=dfCovClassab['labels_encoded'])
'''

# Stratified 90/10 split into training and validation samples.
cov_train_part, cov_test_part = train_test_split(
    dfCovClass,
    test_size=0.1,
    random_state=seed,
    stratify=dfCovClass['labels_encoded'],
)

# Renumber the rows from 0 and drop the helper column that is no longer needed.
dfCovClass_train = cov_train_part.reset_index(drop=True).drop(columns=['tweet_proc_length'])
# Class balance of the training sample (value is not stored or printed).
dfCovClass_train.groupby('labels_encoded', group_keys=False)['id'].nunique()

dfCovClass_test = cov_test_part.reset_index(drop=True).drop(columns=['tweet_proc_length'])
# Class balance of the validation sample (value is not stored or printed).
dfCovClass_test.groupby('labels_encoded', group_keys=False)['id'].nunique()
# save dfs as csvs and tsvs, for training and validation
# covid classification datafiles
# rows 0-41 = noncovid, 42-81 covid, therefore:
#dfCovClass = dfCovClass.drop(columns=['tweet_proc_length'])
#dfCovClass.reset_index(inplace=True, drop=True)
#dfCovClass.loc[np.r_[0:31, 42:71], :].reset_index(drop=True).to_csv(twtCSVtrainCovClassPathTrain, encoding='utf-8', sep=";")
#dfCovClass.loc[np.r_[0:31, 42:72], :].reset_index(drop=True).to_csv(twtTSVtrainCovClassPathTrain, encoding='utf-8', sep="\t")
#dfCovClass.loc[np.r_[31:41, 72:81], :].reset_index(drop=True).to_csv(twtCSVtrainCovClassPath, encoding='utf-8', sep=";")
#dfCovClass.loc[np.r_[31:41, 72:81], :].reset_index(drop=True).to_csv(twtTSVtrainCovClassPathEval, encoding='utf-8', sep="\t")
# fake news classification datafiles
#dfFakeClass = dfFakeClass.drop(columns=['tweet_proc_length'])
#dfFakeClass[200:1000].reset_index(drop=True).to_csv(twtCSVtrainFakeClassPathTrain, encoding='utf-8', sep=";")
#dfFakeClass[200:1000].reset_index(drop=True).to_csv(twtTSVtrainFakeClassPathTrain, encoding='utf-8', sep="\t")
#dfFakeClass[0:199].reset_index(drop=True).to_csv(twtCSVtrainFakeClassPath, encoding='utf-8', sep=";")
#dfFakeClass[0:199].reset_index(drop=True).to_csv(twtTSVtrainFakeClassPathEval, encoding='utf-8', sep="\t")
#%%
# Prepare trainer
#from transformers import TrainingArguments
#training_args = TrainingArguments(
# report_to = 'wandb',
# output_dir=wd+'results', # output directory/
# overwrite_output_dir = True,
# num_train_epochs=6, # total number of training epochs
# per_device_train_batch_size=8, # batch size per device during training
# per_device_eval_batch_size=16, # batch size for evaluation
# learning_rate=2e-5,
# warmup_steps=1000, # number of warmup steps for learning rate scheduler
# weight_decay=0.01, # strength of weight decay
# logging_dir='./logs3', # directory for storing logs
# logging_steps=1000,
# evaluation_strategy="epoch",
# save_strategy="epoch",
# load_best_model_at_end=True
#)
# Tokenizer belonging to the checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained(model_name)

from transformers import BertForSequenceClassification, AdamW  # BertConfig is not used
#from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler

# Disabled alternative: loading the CSV files through `datasets.load_dataset`.
"""
train_dataset = load_dataset('csv', data_files={'train': twtCSVtrainCovClassPathTrain}, encoding = "utf-8")
train_dataset = train_dataset['train']
eval_dataset = load_dataset('csv', data_files={'test': twtCSVtrainCovClassPath}, encoding = "utf-8")
eval_dataset = eval_dataset['test']
"""

# Number of tweets per training/validation batch.
batch_size = 1
class PandasDataset(Dataset):
    """Dataset that tokenizes one DataFrame row per requested item.

    Each row must provide a 'text' column, which is handed to the tokenizer,
    and a 'labels_encoded' column holding the already encoded class label.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        """Store the frame, the tokenizer callable and the target length."""
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.dataframe = dataframe

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Return the tensors for the row at integer position `index`.

        The result is a dict with the keys 'input_ids', 'attention_mask'
        and 'labels', in that order.
        """
        record = self.dataframe.iloc[index]
        # Pad/truncate every text to exactly `max_length` tokens.
        encoding = self.tokenizer(
            record['text'],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )
        item = {
            field: torch.tensor(encoding[field])
            for field in ('input_ids', 'attention_mask')
        }
        # Labels are assumed to be encoded already.
        item['labels'] = torch.tensor(record['labels_encoded'])
        return item
# Training data: batches are drawn in random order.
train_dataset = PandasDataset(dfCovClass_train, tokenizer, max_length)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

# Validation data: batches are read sequentially.
eval_dataset = PandasDataset(dfCovClass_test, tokenizer, max_length)
eval_sampler = SequentialSampler(eval_dataset)
validation_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=batch_size)

# Sanity check: show the first training batch (if there is one).
first_batch = next(iter(train_dataloader), None)
if first_batch is not None:
    print('Batch index: ', 0)
    print('Batch size: ', first_batch['input_ids'].size())  # Access 'input_ids' field
    print('Batch label: ', first_batch['labels'])  # Access 'labels' field

# Pretrained BERT checkpoint with a sequence-classification head.
model_options = {
    # The number of output labels -- 2 for binary classification.
    # You can increase this for multi-class tasks.
    'num_labels': 2,
    # Whether the model returns attention weights.
    'output_attentions': False,
    # Whether the model returns all hidden states.
    'output_hidden_states': False,
}
model = BertForSequenceClassification.from_pretrained(model_name, **model_options)
#trainer = Trainer(
# model=model, # the instantiated 🤗 Transformers model to be trained
# args=training_args, # training arguments, defined above
# train_dataset=train_dataset, # training dataset
# eval_dataset=eval_dataset # evaluation dataset
#)
# Optimizer: AdamW (Adam with decoupled weight decay).
# The AdamW class shipped with the transformers library is deprecated in
# favour of the PyTorch implementation, so torch.optim.AdamW is used here.
# weight_decay is set to 0.0 explicitly because that was the default of the
# transformers class, whereas torch.optim.AdamW defaults to 0.01.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
    eps=1e-8,  # args.adam_epsilon - default is 1e-8.
    weight_decay=0.0,
)

from transformers import get_linear_schedule_with_warmup

# Number of training epochs. The BERT authors recommend between 2 and 4.
# We chose to run for 6
epochs = 6

# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Learning-rate schedule configured with no warmup steps and the total
# number of optimizer steps of the whole run.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # Default value in run_glue.py
    num_training_steps=total_steps,
)
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the share of predictions that match the labels.

    `preds` holds one row of class scores per sample; the predicted class
    is the index of the largest score in each row. `labels` is an array of
    the expected class ids and is flattened before the comparison.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    expected_classes = labels.flatten()
    hits = predicted_classes == expected_classes
    return np.sum(hits) / len(expected_classes)
import time
import datetime
def format_time(elapsed):
    """Format a duration given in seconds as text.

    The value is rounded to whole seconds and rendered the way
    `datetime.timedelta` prints itself, e.g. '0:01:05'.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)
import random
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
# Seed value passed to the Python, NumPy and PyTorch random number generators
# below to make the run reproducible.
seed_val = 12355
# If there's a GPU available...
if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    #model.cuda()
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
# NOTE(review): this unconditional assignment replaces whatever device was
# selected above, so the run uses the CPU even after the "We will use the
# GPU" message was printed. Presumably a deliberate override -- confirm, and
# remove this line to actually train on the GPU.
device = torch.device("cpu")
# Seed every random number generator involved.
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
#%%
# Start training
# One dict per epoch: training and validation loss, validation accuracy and
# timings.
training_stats = []
# Measure the total training time for the whole run.
total_t0 = time.time()
# The device does not change between epochs, so the model is moved once.
model.to(device)

# For each epoch...
for epoch_i in range(0, epochs):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('{:>5,} steps per batch will be calculated.'.format(len(train_dataloader)))
    print('Training...')
    # Measure how long the training epoch takes.
    t0 = time.time()
    # Reset the total loss for this epoch.
    total_train_loss = 0
    # Put the model into training mode. Don't be misled -- the call to
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):
        # Progress update every 10 batches.
        if step % 10 == 0 and not step == 0:
            # Elapsed time since the start of this epoch, formatted as text.
            elapsed = format_time(time.time() - t0)
            # Report progress.
            print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # `batch` is a dict with the keys 'input_ids', 'attention_mask' and
        # 'labels'; copy each tensor to the selected device.
        b_input_ids = batch['input_ids'].to(device)
        b_input_mask = batch['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because
        # accumulating the gradients is "convenient while training RNNs".
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        # The documentation for this `model` function is here:
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        # Because labels are provided, the first element of the output is the
        # loss; the logits that follow it are not needed during training.
        output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = output[0]

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing
        # a single value; `.item()` returns it as a Python number.
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update parameters and take a step using the computed gradient.
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)
    print("")
    print(" Average training loss: {0:.2f}".format(avg_train_loss))
    print(" Training epcoh took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    print("")
    print("Running Validation...")
    t0 = time.time()
    # Put the model in evaluation mode -- the dropout layers behave
    # differently during evaluation.
    model.eval()
    # Tracking variables
    total_eval_accuracy = 0
    total_eval_loss = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        # Same dict layout as in training; copy each tensor to the device.
        b_input_ids = batch['input_ids'].to(device)
        b_input_mask = batch['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)

        # Tell pytorch not to bother with constructing the compute graph
        # during the forward pass, since this is only needed for backprop
        # (training).
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            # token_type_ids is the same as the "segment ids", which
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here:
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        # The "logits" are the output values prior to applying an activation
        # function like the softmax.
        loss = output[0]
        logits = output[1]

        # Accumulate the validation loss.
        total_eval_loss += loss.item()
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print(" Accuracy: {0:.2f}".format(avg_val_accuracy))
    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    print(" Validation Loss: {0:.2f}".format(avg_val_loss))
    print(" Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
# Final report: total wall-clock time, then a short overview of the model's
# parameter tensors (name and shape), grouped into three slices of the
# named-parameter list.
print("")
print("Training complete!")
total_elapsed = format_time(time.time() - total_t0)
print("Total training took {:} (h:mm:ss)".format(total_elapsed))

params = list(model.named_parameters())
print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# (heading, slice of `params`) pairs, printed in this order.
param_sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
)
for heading, entries in param_sections:
    print(heading)
    for param_name, param_tensor in entries:
        print("{:<55} {:>12}".format(param_name, str(tuple(param_tensor.size()))))
import os
from datetime import datetime as dt
import pandas as pd

# Persist the fine-tuned model, its tokenizer and the per-epoch statistics in
# a fresh, timestamp-named directory below `modCovClassPath`.
# Saving best-practices: if you use default names for the model, you can
# reload it using from_pretrained().
fTimeFormat = "%Y-%m-%d_%H-%M-%S"
now = dt.now().strftime(fTimeFormat)
output_dir = modCovClassPath + now + "/"

# Create the output directory if needed; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)
print("Saving model to %s" % output_dir)

# Save the trained model, configuration and tokenizer using
# `save_pretrained()`. They can then be reloaded using `from_pretrained()`.
# A model wrapped for distributed/parallel training exposes the real model as
# `.module`; otherwise the model itself is saved.
model_to_save = getattr(model, 'module', model)
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
# Good practice: save your training arguments together with the trained model
# torch.save(args, os.path.join(output_dir, 'training_args.bin'))

# Display floats with two decimal places.
pd.set_option('display.precision', 2)
# One row per epoch, indexed by the epoch number.
df_stats = pd.DataFrame(data=training_stats).set_index('epoch')
# Store the statistics next to the model, named after the same timestamp.
df_stats.to_csv(output_dir + now + ".csv")