14 Commits

17 changed files with 274 additions and 6590 deletions

2
.gitignore vendored
View File

@@ -2,6 +2,8 @@
**/*lock* **/*lock*
**/*-slice*.csv **/*-slice*.csv
**/*.zip **/*.zip
**/*.html
**/*.htm
/ALL-SENATORS-LONG.csv /ALL-SENATORS-LONG.csv
/ALL-SENATORS.csv /ALL-SENATORS.csv
/collect2.py /collect2.py

View File

@@ -1,113 +0,0 @@
import re
import string
import numpy as np
import pandas as pd
from datetime import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
#%%
# Classify senator tweets with the covid-twitter-bert-v2-struth model.
# Reads SenatorsTweets-OnlyCov.csv, cleans the tweet text, runs the
# text-classification pipeline in batches and writes the labels/scores
# back out as Tweets-Classified-Results.csv.
# install xformers (pip install xformers) for better performance
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
# Name of file that all senator data will be written to
senCSV = "SenatorsTweets-OnlyCov.csv"
# Name of Classify datafile
senCSVClassifiedPrep = "Tweets-Classified-Prep.csv"
senCSVClassifiedResult = "Tweets-Classified-Results.csv"
# don't change this one
senCSVPath = wd + ud + senCSV
senCSVcClassificationPrepPath = wd + ud + senCSVClassifiedPrep
senCSVcClassificationResultPath = wd + ud + senCSVClassifiedResult
#%%
# get dataframe; everything read as object/str so tweet ids are not mangled
dfClassify = pd.read_csv(senCSVPath, dtype=(object))
# dataframe from csv
dfClassify['fake'] = False
#%%
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
# HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth")
model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
# Clean tweet text for the model: strip URLs, emoji, HTML, punctuation, lowercase.
dfClassify['cleanContent'] = dfClassify['rawContent'].apply(remove_URL)
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(remove_emoji)
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(remove_html)
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(remove_punct)
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(lambda x: x.lower())
#%%
# remove rows whose content became empty after cleaning
# (explicit assignment instead of chained inplace replace, which is
# deprecated and can silently operate on a copy)
dfClassify['cleanContent'] = dfClassify['cleanContent'].replace('', np.nan)
dfClassify.dropna(subset=['cleanContent'], inplace=True)
#%%
timeStart = datetime.now() # start counting execution time
max_length = 128
dfClassify['input_ids'] = dfClassify['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
#train.rename(columns={'target': 'labels'}, inplace=True)
#train.head()
# %%
dfClassify.to_csv(senCSVcClassificationPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
#%%
dataset = load_dataset("csv", data_files=senCSVcClassificationPrepPath)
# %%from datetime import datetime
#from tqdm.auto import tqdm
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
# print(out)
#%%
output_labels = []
output_score = []
# Stream the cleanContent column through the pipeline in batches of 8;
# KeyDataset yields one column, so outputs arrive in row order.
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
    output_labels.append(out['label'])
    output_score.append(out['score'])
# [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
# Exactly the same output as before, but the content are passed
# as batches to the model
# %%
dfClassify['output_label'] = output_labels
dfClassify['output_score'] = output_score
timeEnd = datetime.now()
timeTotal = timeEnd - timeStart
# Divide by the actual number of classified rows (was hard-coded to 96,
# which only matched the pretest sample size, not this dataset).
timePerTweet = timeTotal / len(dfClassify)
print(f"Total classification execution time: {timeTotal}")
print(f"Time per tweet classification: {timePerTweet}")
# %%
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
# %%

View File

@@ -1,12 +1,9 @@
import re
import string
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from datetime import datetime from datetime import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset from transformers.pipelines.pt_utils import KeyDataset
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
#%% #%%
@@ -26,11 +23,11 @@ di = "data/IN/"
ud = "data/OUT/" ud = "data/OUT/"
# Name of file that all senator data will be written to # Name of file that all senator data will be written to
senCSV = "SenatorsTweets-OnlyCov.csv" senCSV = "Tweets-Classified-Topic-Results.csv"
# Name of Classify datafile # Name of Classify datafile
senCSVClassifiedPrep = "Tweets-Classified-Topic-Prep.csv" senCSVClassifiedPrep = "Tweets-Classified-Fake-Prep.csv"
senCSVClassifiedResult = "Tweets-Classified-Topic-Results.csv" senCSVClassifiedResult = "Tweets-Classified-Fake-Results.csv"
# don't change this one # don't change this one
senCSVPath = wd + ud + senCSV senCSVPath = wd + ud + senCSV
@@ -46,6 +43,16 @@ import CleanTweets
#%% #%%
# get datafra,e # get datafra,e
dfClassify = pd.read_csv(senCSVPath, dtype=(object)) dfClassify = pd.read_csv(senCSVPath, dtype=(object))
def encode_labels(label):
if label == 'True':
return 'False'
elif label == 'False':
return 'True'
return 0
dfClassify['output_label_topicCov'] = dfClassify['output_label_topicCov'].apply(encode_labels)
dfClassify.to_csv("/home/michael/Documents/PS/Data/collectTweets/data/OUT/Tweets-Classified-Topic-Results.csv", encoding='utf-8')
dfClassify = dfClassify[dfClassify['output_label_topicCov']=='True']
# dataframe from csv # dataframe from csv
dfClassify['fake'] = False dfClassify['fake'] = False
@@ -56,9 +63,9 @@ dfClassify['fake'] = False
# HowTo: # HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification # https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline # https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth") pipe = pipeline("text-classification", model="/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth") model = AutoModelForSequenceClassification.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth") tokenizer = AutoTokenizer.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert # Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
@@ -100,8 +107,8 @@ for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, trun
# Exactly the same output as before, but the content are passed # Exactly the same output as before, but the content are passed
# as batches to the model # as batches to the model
# %% # %%
dfClassify['output_label'] = output_labels dfClassify['output_label_fake'] = output_labels
dfClassify['output_score'] = output_score dfClassify['output_score_fake'] = output_score
timeEnd = datetime.now() timeEnd = datetime.now()
timeTotal = timeEnd - timeStart timeTotal = timeEnd - timeStart

View File

@@ -1,12 +1,9 @@
import re
import string
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from datetime import datetime from datetime import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset from transformers.pipelines.pt_utils import KeyDataset
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
#%% #%%
@@ -99,8 +96,8 @@ for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, trun
# Exactly the same output as before, but the content are passed # Exactly the same output as before, but the content are passed
# as batches to the model # as batches to the model
# %% # %%
dfClassify['output_label'] = output_labels dfClassify['output_label_topicCov'] = output_labels
dfClassify['output_score'] = output_score dfClassify['output_score_topicCov'] = output_score
timeEnd = datetime.now() timeEnd = datetime.now()
timeTotal = timeEnd - timeStart timeTotal = timeEnd - timeStart
@@ -113,3 +110,14 @@ print(f"Time per tweet classification: {timePerTweet}")
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8') dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
# %% # %%
## corrections
def encode_labels(label):
if label == 'real':
return 'True'
elif label == 'fake':
return 'False'
return 0
dfClassify['output_label_topicCov'] = dfClassify['output_label_topicCov'].apply(encode_labels)
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
#still wrong, will be corrected in ClassificationFake.py

132
README.md
View File

@@ -1,7 +1,131 @@
# How to use # Requirements
Execute collect.py to scrape tweets and generate the ´ALL-SENATORS-TWEETS.csv´. - python 3.10+
- snscrape 0.6.2.20230321+ (see git repo in this folder)
- transformers 4.31.0
- numpy 1.23.5
- pandas 2.0.3
- scikit-learn 1.3.0
- torch 2.0.1
Execute collectSenData.py to scrape senator data and generate ´ALL-SENATORS.csv´. # About
All new files will be written to ´data/OUT/´. Necessary data has to be located in ´data/IN/´ This collection of scripts scrapes tweets of US-senators in the time from 2020-01-01T00:00:00Z to 2023-01-03T00:00:00Z, scrapes account data of the senators, prepares the tweets for the training of a NLP-model, trains two models to (1) classify the tweets topic as covid or non-covid and (2) the tweets as either "fake news" tweets or "non-fake news" tweets.
Training only works with a prepared dataset in which the tweets are pre-classified.
More info in the comments of the scripts.
Due to time constraints, most of the code is procedurally coded and ugly but effective.
# How to
Tested on Ubuntu 22.04.
If needed, the virtual environment can be exported and sent to you.
All files in the folder data/in have to exist in order to execute the scripts.
Execute in the following order:
01 collect.py (see more for further info on scraping)
02 collectSenData.py
03 cleanTweets.py
04 preTestClassification.py
05 trainTopic.py
06 trainFake.py
07 ClassificationFake.py
08 ClassificationTopic.py
# Files & Folders
Datafiles are not included in the repository but can be found in the full package that can be downloaded from [here](https://ncloud.mischbeck.de/s/T4QcMDSfYSkadYC) (password protected).
```
├── data
│   ├── IN
│   │   ├── counterKeywordsFinal.txt
│   │   ├── counterKeywords.txt
│   │   ├── keywords-raw.txt
│   │   ├── keywords.txt
│   │   ├── own_keywords.txt
│   │   ├── pretest-tweets_fake.txt contains tweet ids for pretest
│   │   ├── pretest-tweets_not_fake.txt contains tweet ids for pretest
│   │   └── senators-raw.csv senator datafile
│   ├── OUT
│   │   ├── ALL-SENATORS-TWEETS.csv
│   │   ├── graphs
│   │   │   ├── Timeline.png
│   │   │   ├── Wordcloud-All.png
│   │   │   └── Wordcloud-Cov.png
│   │   ├── Pretest-Prep.csv
│   │   ├── Pretest-Results.csv
│   │   ├── Pretest-SENATORS-TWEETS.csv
│   │   ├── profiles dataset profiles
│   │   │   ├── AllTweets.html
│   │   │   └── CovTweets.html
│   │   ├── SenatorsTweets-Final.csv
│   │   ├── SenatorsTweets-OnlyCov.csv
│   │   ├── SenatorsTweets-train-CovClassification.csv
│   │   ├── SenatorsTweets-train-CovClassificationTRAIN.csv
│   │   ├── SenatorsTweets-train-CovClassification.tsv
│   │   ├── SenatorsTweets-train-FakeClassification.csv
│   │   ├── SenatorsTweets-train-FakeClassificationTRAIN.csv
│   │   ├── SenatorsTweets-train-FakeClassification.tsv
│   │   ├── SenatorsTweets-Training.csv
│   │   ├── SenatorsTweets-Training_WORKING-COPY.csv
│   │   ├── topClass-PRETEST-Prep.csv
│   │   ├── topClass-PRETEST-Results.csv
│   │   ├── Tweets-All-slices.zip
│   │   ├── Tweets-Classified-Fake-Prep.csv
│   │   ├── Tweets-Classified-Fake-Results.csv
│   │   ├── Tweets-Classified-Prep.csv
│   │   ├── Tweets-Classified-Topic-Prep.csv
│   │   ├── Tweets-Classified-Topic-Results.csv
│   │   └── Tweets-Stub.csv
├── funs
│   ├── CleanTweets.py 2023-01-03T00:00:00Z multiple functions to clean tweet contents for NLN-processing
│   ├── ClearDupes.py function for deletion of duplicate keywords
│   ├── __init__.py
│   ├── Scrape.py scraper functions to be used for multiprocessing
│   └── TimeSlice.py time slice script to slice the time span in 24 slices, speeds up scraping through multiprocessing
├── log logs of the scraping process
│   ├── log_2023-06-23_21-06-10_err.log
│   ├── log_2023-06-23_21-06-10.log
│   └── log_2023-06-23_21-06-10_missing.log
├── models
│   ├── CovClass Covid tweet classification model
│   │   └── 2023-08-15_05-56-50
│   │   ├── 2023-08-15_05-56-50.csv training output
│   │   ├── config.json
│   │   ├── pytorch_model.bin
│   │   ├── special_tokens_map.json
│   │   ├── tokenizer_config.json
│   │   ├── tokenizer.json
│   │   └── vocab.txt
│   └── FakeClass Fake tweet classification model
│   └── 2023-08-15_14-35-43
│   ├── 2023-08-15_14-35-43.csv training output
│   ├── config.json
│   ├── pytorch_model.bin
│   ├── special_tokens_map.json
│   ├── tokenizer_config.json
│   ├── tokenizer.json
│   └── vocab.txt
├── snscrape contains snscrape 0.6.2.20230321+ git repo
├── ClassificationFake.py classifies tweets as fake or non-fake, saves:
│ Tweets-Classified-Fake-Prep.csv - prepared training dataset
│ Tweets-Classified-Fake-Results.csv - Tweets-Classified-Topic-Results.csv with cov classification results
├── ClassificationTopic.py classifies tweet topic, saves:
│ Tweets-Classified-Topic-Prep.csv - prepared training dataset
│ Tweets-Classified-Topic-Results.csv - SenatorsTweets-OnlyCov.csv with cov classification results
├── cleanTweets.py Curates keywordlists
│ Merges senator and tweet datasets
│ Creates multiple datasets:
│ SenatorsTweets-Final.csv - all tweets with keyword columns
│ SenatorsTweets-OnlyCov.csv - only covid tweets, filtered by keywordlist
│ SenatorsTweets-Training.csv - training dataset, containing ~1800 randomly selected tweets from SenatorsTweets-OnlyCov.csv
├── collect.py scrapes tweets, saves to ALL-SENATORS-TWEETS.csv
├── collectSenData.py scrapes senator account data, saves to ALL-SENATORS.csv
├── createGraphs.py creates wordcloud & timeline graphs
├── preTestClassification.py pretest script that uses bvrau/covid-twitter-bert-v2-struth to analyze 100 preclassified tweets
├── profiler.py creates dataset profiles
├── README.md readme
├── trainFake.py training script for the fake tweet classification model
└── trainTopic.py training script for the tweet topic classification model
```

View File

@@ -1,129 +0,0 @@
import re
import string
import numpy as np
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
#%%
# Pretest: run the covid-twitter-bert-v2-struth classifier over a small set of
# pre-labeled tweets (fake / not fake, selected by id lists in data/IN/) and
# write the model's labels and scores to Pretest-Results.csv.
# install xformers (pip install xformers) for better performance
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
# Name of file that all senator data will be written to
senCSV = "ALL-SENATORS-TWEETS.csv"
# Name of new datafile generated
senCSVc = "Tweets-Stub.csv"
# Name of pretest files
preTestIDsFake = "pretest-tweets_fake.txt"
preTestIDsNot = "pretest-tweets_not_fake.txt"
# Name of pretest datafile
senCSVPretest = "Pretest.csv"
senCSVPretestPrep = "Pretest-Prep.csv"
senCSVPretestResult = "Pretest-Results.csv"
# don't change this one
senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSVc
senCSVcPretestPath = wd + ud + senCSVPretest
senCSVcPretestPrepPath = wd + ud + senCSVPretestPrep
senCSVcPretestResultPath = wd + ud + senCSVPretestResult
preTestIDsFakePath = wd + di + preTestIDsFake
preTestIDsNotPath = wd + di + preTestIDsNot
# List of IDs to select
# Read the IDs from a file (one tweet id per line)
preTestIDsFakeL = []
preTestIDsNotL = []
with open(preTestIDsFakePath, "r") as file:
    lines = file.readlines()
    for line in lines:
        tid = line.strip() # Remove the newline character
        preTestIDsFakeL.append(tid)
with open(preTestIDsNotPath, "r") as file:
    lines = file.readlines()
    for line in lines:
        tid = line.strip() # Remove the newline character
        preTestIDsNotL.append(tid)
# Select rows based on the IDs
df = pd.read_csv(senCSVPath, dtype=(object))
#%%
# Create pretest dataframe: fake-labeled tweets first, then the not-fake set;
# rows from the second set get fake=NaN which is filled with False below.
dfPreTest = df[df['id'].isin(preTestIDsFakeL)].copy()
dfPreTest['fake'] = True
dfPreTest = pd.concat([dfPreTest, df[df['id'].isin(preTestIDsNotL)]], ignore_index=True)
dfPreTest['fake'] = dfPreTest['fake'].fillna(False)
#%%
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
# HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth")
model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
# Clean tweet text for the model: strip URLs, emoji, HTML, punctuation, lowercase.
dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(remove_URL)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_emoji)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_html)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_punct)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(lambda x: x.lower())
#%%
max_length = 128
dfPreTest['input_ids'] = dfPreTest['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
#train.rename(columns={'target': 'labels'}, inplace=True)
#train.head()
# %%
dfPreTest.to_csv(senCSVcPretestPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
#%%
dataset = load_dataset("csv", data_files=senCSVcPretestPrepPath)
# %%
# NOTE: removed a broken, unused call `pipe(KeyDataset(dataset, "text"))` here:
# load_dataset returns a DatasetDict (needs dataset['train']) and the text
# column is 'cleanContent', not 'text'. The batched loop below does the
# actual classification.
#from tqdm.auto import tqdm
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
# print(out)
#%%
output_labels = []
output_score = []
# Stream the cleanContent column through the pipeline in batches of 8;
# KeyDataset yields one column, so outputs arrive in row order.
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
    output_labels.append(out['label'])
    output_score.append(out['score'])
# [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
# Exactly the same output as before, but the content are passed
# as batches to the model
# %%
dfPreTest['output_label'] = output_labels
dfPreTest['output_score'] = output_score
# %%
dfPreTest.to_csv(senCSVcPretestResultPath, encoding='utf-8')
# %%

View File

@@ -9,7 +9,8 @@ Created on Mon Jun 26 20:36:43 2023
import pandas as pd import pandas as pd
# import pyreadstat # import pyreadstat
import numpy as np import numpy as np
from funs.ClearDupes import deDupe import sys
# Seet for training dataset generation # Seet for training dataset generation
seed = 86431891 seed = 86431891
@@ -49,6 +50,11 @@ senDatasetPath = wd + di + senDataset
df = pd.read_csv(senCSVPath, dtype=(object)) df = pd.read_csv(senCSVPath, dtype=(object))
## Import own functions
funs = wd+"funs"
sys.path.insert(1, funs)
from ClearDupes import deDupe
mixed_columns = df.columns[df.nunique() != len(df)] mixed_columns = df.columns[df.nunique() != len(df)]
print(mixed_columns) print(mixed_columns)

View File

@@ -66,7 +66,6 @@ which is the final output.
import os import os
import pandas as pd import pandas as pd
import glob import glob
import time
import sys import sys
from datetime import datetime from datetime import datetime
import concurrent.futures import concurrent.futures
@@ -149,10 +148,12 @@ tweetDFColumns = [
################## do NOT change anything below this line ################### ################## do NOT change anything below this line ###################
############################################################################# #############################################################################
## Import functions ## Import own functions
from funs.TimeSlice import * funs = wd+"funs"
from funs.ClearDupes import deDupe sys.path.insert(1, funs)
from funs.Scrape import scrapeTweets from TimeSlice import get_Tslices
from ClearDupes import deDupe
from Scrape import scrapeTweets
################### ###################
# Create logfile & log all outputs # Create logfile & log all outputs

View File

@@ -0,0 +1,23 @@
meth
gun violence
flu season
vaping
chd
addiction
indigenous women
separating children
tobacco
e-cigarette
muslim ban
soleimani
cocaine
separating families
muslim travel ban
usmca trade deal
shooting
overdose
separated children
coronary heart disease
gun-violence
opioid
flu-season

View File

@@ -18,44 +18,43 @@ socialdistancing
wear a mask wear a mask
lockdown lockdown
covd covd
Coronavirus coronavirus
Koronavirus koronavirus
Corona corona
CDC cdc
Wuhancoronavirus wuhancoronavirus
Wuhanlockdown wuhanlockdown
Ncov ncov
Wuhan wuhan
N95 n95
Kungflu kungflu
Epidemic epidemic
outbreak outbreak
Sinophobia sinophobia
China
covid-19 covid-19
corona virus corona virus
covid covid
covid19 covid19
sars-cov-2 sars-cov-2
COVIDー19 covidー19
COVD covd
pandemic pandemic
coronapocalypse coronapocalypse
canceleverything canceleverything
Coronials coronials
SocialDistancingNow socialdistancingnow
Social Distancing social distancing
SocialDistancing socialdistancing
panicbuy panicbuy
panic buy panic buy
panicbuying panicbuying
panic buying panic buying
14DayQuarantine 14dayquarantine
DuringMy14DayQuarantine duringmy14dayquarantine
panic shop panic shop
panic shopping panic shopping
panicshop panicshop
InMyQuarantineSurvivalKit inmyquarantinesurvivalkit
panic-buy panic-buy
panic-shop panic-shop
coronakindness coronakindness
@@ -65,7 +64,7 @@ chinesevirus
stayhomechallenge stayhomechallenge
stay home challenge stay home challenge
sflockdown sflockdown
DontBeASpreader dontbeaspreader
lockdown lockdown
lock down lock down
shelteringinplace shelteringinplace
@@ -79,13 +78,13 @@ flatten the curve
china virus china virus
chinavirus chinavirus
quarentinelife quarentinelife
PPEshortage ppeshortage
saferathome saferathome
stayathome stayathome
stay at home stay at home
stay home stay home
stayhome stayhome
GetMePPE getmeppe
covidiot covidiot
epitwitter epitwitter
pandemie pandemie
@@ -93,7 +92,7 @@ wear a mask
wearamask wearamask
kung flu kung flu
covididiot covididiot
COVID__19 covid__19
omicron omicron
variant variant
vaccine vaccine
@@ -139,9 +138,7 @@ work from home
workfromhome workfromhome
working from home working from home
workingfromhome workingfromhome
ppe
n95 n95
ppe
n95 n95
covidiots covidiots
covidiots covidiots

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,7 @@
epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
1,0.39025546515679493,0.40877932761593355,0.9103260869565217,0:10:21,0:00:40
2,0.3057803610952067,0.3502063500978377,0.9103260869565217,0:10:53,0:00:43
3,0.17910970049364833,0.27903796154904464,0.9375,0:10:30,0:00:38
4,0.09279396105943587,0.41342766528301267,0.904891304347826,0:11:03,0:00:43
5,0.06132459050129317,0.4468563502887264,0.9239130434782609,0:12:07,0:00:44
6,0.04195396880810895,0.4350045176675928,0.9266304347826086,0:11:21,0:00:40
1 epoch Training Loss Valid. Loss Valid. Accur. Training Time Validation Time
2 1 0.39025546515679493 0.40877932761593355 0.9103260869565217 0:10:21 0:00:40
3 2 0.3057803610952067 0.3502063500978377 0.9103260869565217 0:10:53 0:00:43
4 3 0.17910970049364833 0.27903796154904464 0.9375 0:10:30 0:00:38
5 4 0.09279396105943587 0.41342766528301267 0.904891304347826 0:11:03 0:00:43
6 5 0.06132459050129317 0.4468563502887264 0.9239130434782609 0:12:07 0:00:44
7 6 0.04195396880810895 0.4350045176675928 0.9266304347826086 0:11:21 0:00:40

View File

@@ -1,13 +1,8 @@
import re
import string
import numpy as np
import pandas as pd import pandas as pd
from datetime import datetime from datetime import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset from transformers.pipelines.pt_utils import KeyDataset
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
#%% #%%
# prepare # prepare
@@ -40,7 +35,6 @@ senCSVPretest = "Pretest.csv"
senCSVPretestPrep = "Pretest-Prep.csv" senCSVPretestPrep = "Pretest-Prep.csv"
senCSVPretestResult = "Pretest-Results.csv" senCSVPretestResult = "Pretest-Results.csv"
# don't change this one # don't change this one
senCSVPath = wd + ud + senCSV senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSVc senCSVcPath = wd + ud + senCSVc
@@ -50,6 +44,11 @@ senCSVcPretestResultPath = wd + ud + senCSVPretestResult
preTestIDsFakePath = wd + di + preTestIDsFake preTestIDsFakePath = wd + di + preTestIDsFake
preTestIDsNotPath = wd + di + preTestIDsNot preTestIDsNotPath = wd + di + preTestIDsNot
import sys
funs = wd+"funs"
sys.path.insert(1, funs)
import CleanTweets
# List of IDs to select # List of IDs to select
# Read the IDs from a file # Read the IDs from a file
preTestIDsFakeL = [] preTestIDsFakeL = []
@@ -85,11 +84,7 @@ tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert # Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(remove_URL) dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(CleanTweets.preprocess_text)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_emoji)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_html)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_punct)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(lambda x: x.lower())
#%% #%%
timeStart = datetime.now() # start counting execution time timeStart = datetime.now() # start counting execution time

35
repairmystupidity.py Normal file
View File

@@ -0,0 +1,35 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 14 20:47:22 2023
@author: michael
"""
import pandas as pd
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
falsch = wd + ud + "SenatorsTweets-Training_WORKING-COPY-correct.csv"
richtig = wd + ud + "SenatorsTweets-Training.csv"
correct = wd + ud + "SenatorsTweets-Training_WORKING-COPY-correct2.csv"
# Name of new datafile generated
senCSVprep = "SenatorsTweets-Training_WORKING-COPY-prepared"
# don't change this one
falsch = pd.read_csv(falsch, dtype=(object), sep=";")
richtig = pd.read_csv(richtig, dtype=(object))
df = pd.merge(falsch,richtig[['tid','rawContent', 'date']],on='tid', how='left')
df.drop(columns=['rawContent_x', 'date_x'], inplace=True)
df.rename(columns={'tid_y':'tid', 'rawContent_y':'rawContent', 'date_y':'date'}, inplace=True)
df = df[['tid','date','topicCovid','fake','rawContent','Unnamed: 6']]
df.rename(columns={'Unnamed: 6':'comment'}, inplace=True)
df.to_csv(correct, encoding='utf-8', sep=";")

View File

@@ -15,10 +15,8 @@ from sklearn.model_selection import train_test_split # pip install scikit-learn
import pandas as pd import pandas as pd
## Follow these two guides: ## Uses snippets from this guide:
# best one https://mccormickml.com/2019/07/22/BERT-fine-tuning/ # https://mccormickml.com/2019/07/22/BERT-fine-tuning/
# https://xiangyutang2.github.io/tweet-classification/
# https://medium.com/mlearning-ai/fine-tuning-bert-for-tweets-classification-ft-hugging-face-8afebadd5dbf
################### ###################
# Setup directories # Setup directories

View File

@@ -15,10 +15,8 @@ from sklearn.model_selection import train_test_split # pip install scikit-learn
import pandas as pd import pandas as pd
## Follow these two guides: ## Uses snippets from this guide:
# best one https://mccormickml.com/2019/07/22/BERT-fine-tuning/ # https://mccormickml.com/2019/07/22/BERT-fine-tuning/
# https://xiangyutang2.github.io/tweet-classification/
# https://medium.com/mlearning-ai/fine-tuning-bert-for-tweets-classification-ft-hugging-face-8afebadd5dbf
################### ###################
# Setup directories # Setup directories
@@ -65,11 +63,7 @@ seed = 12355
modCovClassPath = wd + "models/CovClass/" modCovClassPath = wd + "models/CovClass/"
modFakeClassPath = wd + "models/FakeClass/" modFakeClassPath = wd + "models/FakeClass/"
model_name = 'digitalepidemiologylab/covid-twitter-bert-v2' # accuracy 69
#model_name = 'justinqbui/bertweet-covid19-base-uncased-pretraining-covid-vaccine-tweets' #48
#model_name = "cardiffnlp/tweet-topic-latest-multi"
model_name = "bvrau/covid-twitter-bert-v2-struth" model_name = "bvrau/covid-twitter-bert-v2-struth"
#model_name = "cardiffnlp/roberta-base-tweet-topic-single-all"
model_fake_name = 'bvrau/covid-twitter-bert-v2-struth' model_fake_name = 'bvrau/covid-twitter-bert-v2-struth'
# More models for fake detection: # More models for fake detection: