Compare commits
14 Commits
2e067b6a64
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
| 89b4755c65 | |||
| 01e58b1b99 | |||
| d0fcefedf4 | |||
| 71cf907249 | |||
| a9018fedee | |||
| d94a93295f | |||
| 80b63b39df | |||
| d8136909c8 | |||
| 1c6d9d5415 | |||
| 4e08cde317 | |||
| 2535683cdc | |||
| 8f744a08be | |||
| df5fd51a5f | |||
| 3d4f559d2d |
2
.gitignore
vendored
2
.gitignore
vendored
@@ -2,6 +2,8 @@
|
|||||||
**/*lock*
|
**/*lock*
|
||||||
**/*-slice*.csv
|
**/*-slice*.csv
|
||||||
**/*.zip
|
**/*.zip
|
||||||
|
**/*.html
|
||||||
|
**/*.htm
|
||||||
/ALL-SENATORS-LONG.csv
|
/ALL-SENATORS-LONG.csv
|
||||||
/ALL-SENATORS.csv
|
/ALL-SENATORS.csv
|
||||||
/collect2.py
|
/collect2.py
|
||||||
|
|||||||
@@ -1,113 +0,0 @@
|
|||||||
import re
|
|
||||||
import string
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
from datetime import datetime
|
|
||||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
|
||||||
from datasets import load_dataset
|
|
||||||
from transformers.pipelines.pt_utils import KeyDataset
|
|
||||||
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
|
|
||||||
|
|
||||||
|
|
||||||
#%%
|
|
||||||
# prepare & define paths
|
|
||||||
# install xformers (pip install xformers) for better performance
|
|
||||||
###################
|
|
||||||
# Setup directories
|
|
||||||
# WD Michael
|
|
||||||
wd = "/home/michael/Documents/PS/Data/collectTweets/"
|
|
||||||
# WD Server
|
|
||||||
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
|
|
||||||
|
|
||||||
# datafile input directory
|
|
||||||
di = "data/IN/"
|
|
||||||
|
|
||||||
# Tweet-datafile output directory
|
|
||||||
ud = "data/OUT/"
|
|
||||||
|
|
||||||
# Name of file that all senator data will be written to
|
|
||||||
senCSV = "SenatorsTweets-OnlyCov.csv"
|
|
||||||
|
|
||||||
# Name of Classify datafile
|
|
||||||
senCSVClassifiedPrep = "Tweets-Classified-Prep.csv"
|
|
||||||
senCSVClassifiedResult = "Tweets-Classified-Results.csv"
|
|
||||||
|
|
||||||
# don't change this one
|
|
||||||
senCSVPath = wd + ud + senCSV
|
|
||||||
senCSVcClassificationPrepPath = wd + ud + senCSVClassifiedPrep
|
|
||||||
senCSVcClassificationResultPath = wd + ud + senCSVClassifiedResult
|
|
||||||
|
|
||||||
#%%
|
|
||||||
# get dataframe
|
|
||||||
dfClassify = pd.read_csv(senCSVPath, dtype=(object))
|
|
||||||
|
|
||||||
# dataframe from csv
|
|
||||||
dfClassify['fake'] = False
|
|
||||||
|
|
||||||
|
|
||||||
#%%
|
|
||||||
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
|
|
||||||
# HowTo:
|
|
||||||
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
|
|
||||||
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
|
|
||||||
pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth")
|
|
||||||
model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
|
|
||||||
|
|
||||||
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
|
|
||||||
|
|
||||||
dfClassify['cleanContent'] = dfClassify['rawContent'].apply(remove_URL)
|
|
||||||
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(remove_emoji)
|
|
||||||
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(remove_html)
|
|
||||||
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(remove_punct)
|
|
||||||
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(lambda x: x.lower())
|
|
||||||
|
|
||||||
#%%
|
|
||||||
# remove empty rows
|
|
||||||
dfClassify.cleanContent.replace('',np.nan,inplace=True)
|
|
||||||
dfClassify.dropna(subset=['cleanContent'], inplace=True)
|
|
||||||
|
|
||||||
#%%
|
|
||||||
timeStart = datetime.now() # start counting execution time
|
|
||||||
|
|
||||||
max_length = 128
|
|
||||||
dfClassify['input_ids'] = dfClassify['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
|
|
||||||
#train.rename(columns={'target': 'labels'}, inplace=True)
|
|
||||||
#train.head()
|
|
||||||
|
|
||||||
# %%
|
|
||||||
dfClassify.to_csv(senCSVcClassificationPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
|
|
||||||
|
|
||||||
#%%
|
|
||||||
dataset = load_dataset("csv", data_files=senCSVcClassificationPrepPath)
|
|
||||||
|
|
||||||
# %%from datetime import datetime
|
|
||||||
|
|
||||||
#from tqdm.auto import tqdm
|
|
||||||
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
|
|
||||||
# print(out)
|
|
||||||
|
|
||||||
#%%
|
|
||||||
output_labels = []
|
|
||||||
output_score = []
|
|
||||||
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
|
|
||||||
output_labels.append(out['label'])
|
|
||||||
output_score.append(out['score'])
|
|
||||||
# [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
|
|
||||||
# Exactly the same output as before, but the content are passed
|
|
||||||
# as batches to the model
|
|
||||||
# %%
|
|
||||||
dfClassify['output_label'] = output_labels
|
|
||||||
dfClassify['output_score'] = output_score
|
|
||||||
|
|
||||||
timeEnd = datetime.now()
|
|
||||||
timeTotal = timeEnd - timeStart
|
|
||||||
timePerTweet = timeTotal / 96
|
|
||||||
|
|
||||||
print(f"Total classification execution time: {timeTotal} seconds")
|
|
||||||
print(f"Time per tweet classification: {timePerTweet}")
|
|
||||||
|
|
||||||
# %%
|
|
||||||
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
|
|
||||||
|
|
||||||
# %%
|
|
||||||
@@ -1,12 +1,9 @@
|
|||||||
import re
|
|
||||||
import string
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
from transformers.pipelines.pt_utils import KeyDataset
|
from transformers.pipelines.pt_utils import KeyDataset
|
||||||
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
|
|
||||||
|
|
||||||
|
|
||||||
#%%
|
#%%
|
||||||
@@ -26,11 +23,11 @@ di = "data/IN/"
|
|||||||
ud = "data/OUT/"
|
ud = "data/OUT/"
|
||||||
|
|
||||||
# Name of file that all senator data will be written to
|
# Name of file that all senator data will be written to
|
||||||
senCSV = "SenatorsTweets-OnlyCov.csv"
|
senCSV = "Tweets-Classified-Topic-Results.csv"
|
||||||
|
|
||||||
# Name of Classify datafile
|
# Name of Classify datafile
|
||||||
senCSVClassifiedPrep = "Tweets-Classified-Topic-Prep.csv"
|
senCSVClassifiedPrep = "Tweets-Classified-Fake-Prep.csv"
|
||||||
senCSVClassifiedResult = "Tweets-Classified-Topic-Results.csv"
|
senCSVClassifiedResult = "Tweets-Classified-Fake-Results.csv"
|
||||||
|
|
||||||
# don't change this one
|
# don't change this one
|
||||||
senCSVPath = wd + ud + senCSV
|
senCSVPath = wd + ud + senCSV
|
||||||
@@ -46,6 +43,16 @@ import CleanTweets
|
|||||||
#%%
|
#%%
|
||||||
# get dataframe
|
# get dataframe
|
||||||
dfClassify = pd.read_csv(senCSVPath, dtype=(object))
|
dfClassify = pd.read_csv(senCSVPath, dtype=(object))
|
||||||
|
def encode_labels(label):
|
||||||
|
if label == 'True':
|
||||||
|
return 'False'
|
||||||
|
elif label == 'False':
|
||||||
|
return 'True'
|
||||||
|
return 0
|
||||||
|
dfClassify['output_label_topicCov'] = dfClassify['output_label_topicCov'].apply(encode_labels)
|
||||||
|
dfClassify.to_csv("/home/michael/Documents/PS/Data/collectTweets/data/OUT/Tweets-Classified-Topic-Results.csv", encoding='utf-8')
|
||||||
|
|
||||||
|
dfClassify = dfClassify[dfClassify['output_label_topicCov']=='True']
|
||||||
|
|
||||||
# dataframe from csv
|
# dataframe from csv
|
||||||
dfClassify['fake'] = False
|
dfClassify['fake'] = False
|
||||||
@@ -56,9 +63,9 @@ dfClassify['fake'] = False
|
|||||||
# HowTo:
|
# HowTo:
|
||||||
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
|
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
|
||||||
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
|
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
|
||||||
pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth")
|
pipe = pipeline("text-classification", model="/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
|
||||||
model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
|
model = AutoModelForSequenceClassification.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
|
||||||
tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
|
tokenizer = AutoTokenizer.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
|
||||||
|
|
||||||
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
|
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
|
||||||
|
|
||||||
@@ -100,8 +107,8 @@ for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, trun
|
|||||||
# Exactly the same output as before, but the content are passed
|
# Exactly the same output as before, but the content are passed
|
||||||
# as batches to the model
|
# as batches to the model
|
||||||
# %%
|
# %%
|
||||||
dfClassify['output_label'] = output_labels
|
dfClassify['output_label_fake'] = output_labels
|
||||||
dfClassify['output_score'] = output_score
|
dfClassify['output_score_fake'] = output_score
|
||||||
|
|
||||||
timeEnd = datetime.now()
|
timeEnd = datetime.now()
|
||||||
timeTotal = timeEnd - timeStart
|
timeTotal = timeEnd - timeStart
|
||||||
|
|||||||
@@ -1,12 +1,9 @@
|
|||||||
import re
|
|
||||||
import string
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
from transformers.pipelines.pt_utils import KeyDataset
|
from transformers.pipelines.pt_utils import KeyDataset
|
||||||
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
|
|
||||||
|
|
||||||
|
|
||||||
#%%
|
#%%
|
||||||
@@ -99,8 +96,8 @@ for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, trun
|
|||||||
# Exactly the same output as before, but the content are passed
|
# Exactly the same output as before, but the content are passed
|
||||||
# as batches to the model
|
# as batches to the model
|
||||||
# %%
|
# %%
|
||||||
dfClassify['output_label'] = output_labels
|
dfClassify['output_label_topicCov'] = output_labels
|
||||||
dfClassify['output_score'] = output_score
|
dfClassify['output_score_topicCov'] = output_score
|
||||||
|
|
||||||
timeEnd = datetime.now()
|
timeEnd = datetime.now()
|
||||||
timeTotal = timeEnd - timeStart
|
timeTotal = timeEnd - timeStart
|
||||||
@@ -113,3 +110,14 @@ print(f"Time per tweet classification: {timePerTweet}")
|
|||||||
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
|
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
|
## corrections
|
||||||
|
def encode_labels(label):
|
||||||
|
if label == 'real':
|
||||||
|
return 'True'
|
||||||
|
elif label == 'fake':
|
||||||
|
return 'False'
|
||||||
|
return 0
|
||||||
|
dfClassify['output_label_topicCov'] = dfClassify['output_label_topicCov'].apply(encode_labels)
|
||||||
|
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
|
||||||
|
#still wrong, will be corrected in ClassificationFake.py
|
||||||
|
|
||||||
|
|||||||
132
README.md
132
README.md
@@ -1,7 +1,131 @@
|
|||||||
# How to use
|
# Requirements
|
||||||
|
|
||||||
Execute collect.py to scrape tweets and generate the ´ALL-SENATORS-TWEETS.csv´.
|
- python 3.10+
|
||||||
|
- snscrape 0.6.2.20230321+ (see git repo in this folder)
|
||||||
|
- transformers 4.31.0
|
||||||
|
- numpy 1.23.5
|
||||||
|
- pandas 2.0.3
|
||||||
|
- scikit-learn 1.3.0
|
||||||
|
- torch 2.0.1
|
||||||
|
|
||||||
Execute collectSenData.py to scrape senator data and generate ´ALL-SENATORS.csv´.
|
# About
|
||||||
|
|
||||||
All new files will be written to ´data/OUT/´. Necessary data has to be located in ´data/IN/´
|
This collection of scripts scrapes tweets of US senators in the time from 2020-01-01T00:00:00Z to 2023-01-03T00:00:00Z, scrapes account data of the senators, prepares the tweets for the training of an NLP model, and trains two models to (1) classify each tweet's topic as covid or non-covid and (2) classify the tweets as either "fake news" tweets or "non-fake news" tweets.
|
||||||
|
Training only works with a prepared dataset in which the tweets are pre classified.
|
||||||
|
More info in the comments of the scripts.
|
||||||
|
Due to time constraints, most of the code is procedurally coded and ugly but effective.
|
||||||
|
|
||||||
|
# How to
|
||||||
|
|
||||||
|
Tested on Ubuntu 22.04.
|
||||||
|
If needed, the virtual environment can be exported and sent to you.
|
||||||
|
|
||||||
|
All files in the folder data/in have to exist in order to execute the scripts.
|
||||||
|
Execute in the following order:
|
||||||
|
|
||||||
|
01 collect.py (see more for further info on scraping)
|
||||||
|
02 collectSenData.py
|
||||||
|
03 cleanTweets
|
||||||
|
04 preTestClassification.py
|
||||||
|
05 trainTopic.py
|
||||||
|
06 trainFake.py
|
||||||
|
07 ClassificationFake.py
|
||||||
|
08 ClassificationTopic.py
|
||||||
|
|
||||||
|
# Files & Folders
|
||||||
|
|
||||||
|
Datafiles are not included in the repository but can be found in the full package that can be downloaded from [here](https://ncloud.mischbeck.de/s/T4QcMDSfYSkadYC) (password protected).
|
||||||
|
|
||||||
|
```
|
||||||
|
├── data
|
||||||
|
│ ├── IN
|
||||||
|
│ │ ├── counterKeywordsFinal.txt
|
||||||
|
│ │ ├── counterKeywords.txt
|
||||||
|
│ │ ├── keywords-raw.txt
|
||||||
|
│ │ ├── keywords.txt
|
||||||
|
│ │ ├── own_keywords.txt
|
||||||
|
│ │ ├── pretest-tweets_fake.txt contains tweet ids for pretest
|
||||||
|
│ │ ├── pretest-tweets_not_fake.txt contains tweet ids for pretest
|
||||||
|
│ │ └── senators-raw.csv senator datafile
|
||||||
|
│ ├── OUT
|
||||||
|
│ │ ├── ALL-SENATORS-TWEETS.csv
|
||||||
|
│ │ ├── graphs
|
||||||
|
│ │ │ ├── Timeline.png
|
||||||
|
│ │ │ ├── Wordcloud-All.png
|
||||||
|
│ │ │ └── Wordcloud-Cov.png
|
||||||
|
│ │ ├── Pretest-Prep.csv
|
||||||
|
│ │ ├── Pretest-Results.csv
|
||||||
|
│ │ ├── Pretest-SENATORS-TWEETS.csv
|
||||||
|
│ │ ├── profiles dataset profiles
|
||||||
|
│ │ │ ├── AllTweets.html
|
||||||
|
│ │ │ └── CovTweets.html
|
||||||
|
│ │ ├── SenatorsTweets-Final.csv
|
||||||
|
│ │ ├── SenatorsTweets-OnlyCov.csv
|
||||||
|
│ │ ├── SenatorsTweets-train-CovClassification.csv
|
||||||
|
│ │ ├── SenatorsTweets-train-CovClassificationTRAIN.csv
|
||||||
|
│ │ ├── SenatorsTweets-train-CovClassification.tsv
|
||||||
|
│ │ ├── SenatorsTweets-train-FakeClassification.csv
|
||||||
|
│ │ ├── SenatorsTweets-train-FakeClassificationTRAIN.csv
|
||||||
|
│ │ ├── SenatorsTweets-train-FakeClassification.tsv
|
||||||
|
│ │ ├── SenatorsTweets-Training.csv
|
||||||
|
│ │ ├── SenatorsTweets-Training_WORKING-COPY.csv
|
||||||
|
│ │ ├── topClass-PRETEST-Prep.csv
|
||||||
|
│ │ ├── topClass-PRETEST-Results.csv
|
||||||
|
│ │ ├── Tweets-All-slices.zip
|
||||||
|
│ │ ├── Tweets-Classified-Fake-Prep.csv
|
||||||
|
│ │ ├── Tweets-Classified-Fake-Results.csv
|
||||||
|
│ │ ├── Tweets-Classified-Prep.csv
|
||||||
|
│ │ ├── Tweets-Classified-Topic-Prep.csv
|
||||||
|
│ │ ├── Tweets-Classified-Topic-Results.csv
|
||||||
|
│ │ └── Tweets-Stub.csv
|
||||||
|
├── funs
|
||||||
|
│ ├── CleanTweets.py 2023-01-03T00:00:00Z multiple functions to clean tweet contents for NLP-processing
|
||||||
|
│ ├── ClearDupes.py function for deletion of duplicate keywords
|
||||||
|
│ ├── __init__.py
|
||||||
|
│ ├── Scrape.py scraper functions to be used for multiprocessing
|
||||||
|
│ └── TimeSlice.py time slice script to slice the time span in 24 slices, speeds up scraping through multiprocessing
|
||||||
|
├── log logs of the scraping process
|
||||||
|
│ ├── log_2023-06-23_21-06-10_err.log
|
||||||
|
│ ├── log_2023-06-23_21-06-10.log
|
||||||
|
│ └── log_2023-06-23_21-06-10_missing.log
|
||||||
|
├── models
|
||||||
|
│ ├── CovClass Covid tweet classification model
|
||||||
|
│ │ └── 2023-08-15_05-56-50
|
||||||
|
│ │ ├── 2023-08-15_05-56-50.csv training output
|
||||||
|
│ │ ├── config.json
|
||||||
|
│ │ ├── pytorch_model.bin
|
||||||
|
│ │ ├── special_tokens_map.json
|
||||||
|
│ │ ├── tokenizer_config.json
|
||||||
|
│ │ ├── tokenizer.json
|
||||||
|
│ │ └── vocab.txt
|
||||||
|
│ └── FakeClass Fake tweet classification model
|
||||||
|
│ └── 2023-08-15_14-35-43
|
||||||
|
│ ├── 2023-08-15_14-35-43.csv training output
|
||||||
|
│ ├── config.json
|
||||||
|
│ ├── pytorch_model.bin
|
||||||
|
│ ├── special_tokens_map.json
|
||||||
|
│ ├── tokenizer_config.json
|
||||||
|
│ ├── tokenizer.json
|
||||||
|
│ └── vocab.txt
|
||||||
|
├── snscrape contains snscrape 0.6.2.20230321+ git repo
|
||||||
|
├── ClassificationFake.py classifies tweets as fake or non-fake, saves:
|
||||||
|
│ Tweets-Classified-Fake-Prep.csv - prepared training dataset
|
||||||
|
│ Tweets-Classified-Fake-Results.csv - Tweets-Classified-Topic-Results.csv with cov classification results
|
||||||
|
├── ClassificationTopic.py classifies tweet topic, saves:
|
||||||
|
│ Tweets-Classified-Topic-Prep.csv - prepared training dataset
|
||||||
|
│ Tweets-Classified-Topic-Results.csv - SenatorsTweets-OnlyCov.csv with cov classification results
|
||||||
|
├── cleanTweets.py Curates keywordlists
|
||||||
|
│ Merges senator and tweet datasets
|
||||||
|
│ Creates multiple datasets:
|
||||||
|
│ SenatorsTweets-Final.csv - all tweets with keyword columns
|
||||||
|
│ SenatorsTweets-OnlyCov.csv - only covid tweets, filtered by keywordlist
|
||||||
|
│ SenatorsTweets-Training.csv - training dataset, containing ~1800 randomly selected tweets from SenatorsTweets-OnlyCov.csv
|
||||||
|
├── collect.py scrapes tweets, saves to ALL-SENATORS-TWEETS.csv
|
||||||
|
├── collectSenData.py scrapes senator account data, saves to ALL-SENATORS.csv
|
||||||
|
├── createGraphs.py creates wordcloud & timeline graphs
|
||||||
|
├── preTestClassification.py pretest script that uses bvrau/covid-twitter-bert-v2-struth to analyze 100 preclassified tweets
|
||||||
|
├── profiler.py creates dataset profiles
|
||||||
|
├── README.md readme
|
||||||
|
├── trainFake.py training script for the fake tweet classification model
|
||||||
|
└── trainTopic.py training script for the tweet topic classification model
|
||||||
|
```
|
||||||
129
analyze.py
129
analyze.py
@@ -1,129 +0,0 @@
|
|||||||
import re
|
|
||||||
import string
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
|
||||||
from datasets import load_dataset
|
|
||||||
from transformers.pipelines.pt_utils import KeyDataset
|
|
||||||
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
|
|
||||||
|
|
||||||
|
|
||||||
#%%
|
|
||||||
# prepare
|
|
||||||
# install xformers (pip install xformers) for better performance
|
|
||||||
###################
|
|
||||||
# Setup directories
|
|
||||||
# WD Michael
|
|
||||||
wd = "/home/michael/Documents/PS/Data/collectTweets/"
|
|
||||||
# WD Server
|
|
||||||
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
|
|
||||||
|
|
||||||
# datafile input directory
|
|
||||||
di = "data/IN/"
|
|
||||||
|
|
||||||
# Tweet-datafile output directory
|
|
||||||
ud = "data/OUT/"
|
|
||||||
|
|
||||||
# Name of file that all senator data will be written to
|
|
||||||
senCSV = "ALL-SENATORS-TWEETS.csv"
|
|
||||||
|
|
||||||
# Name of new datafile generated
|
|
||||||
senCSVc = "Tweets-Stub.csv"
|
|
||||||
|
|
||||||
# Name of pretest files
|
|
||||||
preTestIDsFake = "pretest-tweets_fake.txt"
|
|
||||||
preTestIDsNot = "pretest-tweets_not_fake.txt"
|
|
||||||
|
|
||||||
# Name of pretest datafile
|
|
||||||
senCSVPretest = "Pretest.csv"
|
|
||||||
senCSVPretestPrep = "Pretest-Prep.csv"
|
|
||||||
senCSVPretestResult = "Pretest-Results.csv"
|
|
||||||
|
|
||||||
|
|
||||||
# don't change this one
|
|
||||||
senCSVPath = wd + ud + senCSV
|
|
||||||
senCSVcPath = wd + ud + senCSVc
|
|
||||||
senCSVcPretestPath = wd + ud + senCSVPretest
|
|
||||||
senCSVcPretestPrepPath = wd + ud + senCSVPretestPrep
|
|
||||||
senCSVcPretestResultPath = wd + ud + senCSVPretestResult
|
|
||||||
preTestIDsFakePath = wd + di + preTestIDsFake
|
|
||||||
preTestIDsNotPath = wd + di + preTestIDsNot
|
|
||||||
|
|
||||||
# List of IDs to select
|
|
||||||
# Read the IDs from a file
|
|
||||||
preTestIDsFakeL = []
|
|
||||||
preTestIDsNotL = []
|
|
||||||
with open(preTestIDsFakePath, "r") as file:
|
|
||||||
lines = file.readlines()
|
|
||||||
for line in lines:
|
|
||||||
tid = line.strip() # Remove the newline character
|
|
||||||
preTestIDsFakeL.append(tid)
|
|
||||||
with open(preTestIDsNotPath, "r") as file:
|
|
||||||
lines = file.readlines()
|
|
||||||
for line in lines:
|
|
||||||
tid = line.strip() # Remove the newline character
|
|
||||||
preTestIDsNotL.append(tid)
|
|
||||||
|
|
||||||
# Select rows based on the IDs
|
|
||||||
df = pd.read_csv(senCSVPath, dtype=(object))
|
|
||||||
#%%
|
|
||||||
# Create pretest dataframe
|
|
||||||
dfPreTest = df[df['id'].isin(preTestIDsFakeL)].copy()
|
|
||||||
dfPreTest['fake'] = True
|
|
||||||
dfPreTest = pd.concat([dfPreTest, df[df['id'].isin(preTestIDsNotL)]], ignore_index=True)
|
|
||||||
dfPreTest['fake'] = dfPreTest['fake'].fillna(False)
|
|
||||||
|
|
||||||
#%%
|
|
||||||
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
|
|
||||||
# HowTo:
|
|
||||||
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
|
|
||||||
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
|
|
||||||
pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth")
|
|
||||||
model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
|
|
||||||
|
|
||||||
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
|
|
||||||
|
|
||||||
dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(remove_URL)
|
|
||||||
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_emoji)
|
|
||||||
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_html)
|
|
||||||
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_punct)
|
|
||||||
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(lambda x: x.lower())
|
|
||||||
|
|
||||||
#%%
|
|
||||||
max_length = 128
|
|
||||||
dfPreTest['input_ids'] = dfPreTest['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
|
|
||||||
#train.rename(columns={'target': 'labels'}, inplace=True)
|
|
||||||
#train.head()
|
|
||||||
|
|
||||||
# %%
|
|
||||||
dfPreTest.to_csv(senCSVcPretestPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
|
|
||||||
|
|
||||||
|
|
||||||
#%%
|
|
||||||
dataset = load_dataset("csv", data_files=senCSVcPretestPrepPath)
|
|
||||||
|
|
||||||
# %%
|
|
||||||
results = pipe(KeyDataset(dataset, "text"))
|
|
||||||
# %%
|
|
||||||
#from tqdm.auto import tqdm
|
|
||||||
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
|
|
||||||
# print(out)
|
|
||||||
|
|
||||||
#%%
|
|
||||||
output_labels = []
|
|
||||||
output_score = []
|
|
||||||
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
|
|
||||||
output_labels.append(out['label'])
|
|
||||||
output_score.append(out['score'])
|
|
||||||
# [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
|
|
||||||
# Exactly the same output as before, but the content are passed
|
|
||||||
# as batches to the model
|
|
||||||
# %%
|
|
||||||
dfPreTest['output_label'] = output_labels
|
|
||||||
dfPreTest['output_score'] = output_score
|
|
||||||
|
|
||||||
# %%
|
|
||||||
dfPreTest.to_csv(senCSVcPretestResultPath, encoding='utf-8')
|
|
||||||
|
|
||||||
# %%
|
|
||||||
@@ -9,7 +9,8 @@ Created on Mon Jun 26 20:36:43 2023
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
# import pyreadstat
|
# import pyreadstat
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from funs.ClearDupes import deDupe
|
import sys
|
||||||
|
|
||||||
|
|
||||||
# Seet for training dataset generation
|
# Seet for training dataset generation
|
||||||
seed = 86431891
|
seed = 86431891
|
||||||
@@ -49,6 +50,11 @@ senDatasetPath = wd + di + senDataset
|
|||||||
|
|
||||||
df = pd.read_csv(senCSVPath, dtype=(object))
|
df = pd.read_csv(senCSVPath, dtype=(object))
|
||||||
|
|
||||||
|
## Import own functions
|
||||||
|
funs = wd+"funs"
|
||||||
|
sys.path.insert(1, funs)
|
||||||
|
from ClearDupes import deDupe
|
||||||
|
|
||||||
mixed_columns = df.columns[df.nunique() != len(df)]
|
mixed_columns = df.columns[df.nunique() != len(df)]
|
||||||
print(mixed_columns)
|
print(mixed_columns)
|
||||||
|
|
||||||
|
|||||||
11
collect.py
11
collect.py
@@ -66,7 +66,6 @@ which is the final output.
|
|||||||
import os
|
import os
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import glob
|
import glob
|
||||||
import time
|
|
||||||
import sys
|
import sys
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
@@ -149,10 +148,12 @@ tweetDFColumns = [
|
|||||||
################## do NOT change anything below this line ###################
|
################## do NOT change anything below this line ###################
|
||||||
#############################################################################
|
#############################################################################
|
||||||
|
|
||||||
## Import functions
|
## Import own functions
|
||||||
from funs.TimeSlice import *
|
funs = wd+"funs"
|
||||||
from funs.ClearDupes import deDupe
|
sys.path.insert(1, funs)
|
||||||
from funs.Scrape import scrapeTweets
|
from TimeSlice import get_Tslices
|
||||||
|
from ClearDupes import deDupe
|
||||||
|
from Scrape import scrapeTweets
|
||||||
|
|
||||||
###################
|
###################
|
||||||
# Create logfile & log all outputs
|
# Create logfile & log all outputs
|
||||||
|
|||||||
23
data/IN/counterKeywordsFinal.txt
Normal file
23
data/IN/counterKeywordsFinal.txt
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
meth
|
||||||
|
gun violence
|
||||||
|
flu season
|
||||||
|
vaping
|
||||||
|
chd
|
||||||
|
addiction
|
||||||
|
indigenous women
|
||||||
|
separating children
|
||||||
|
tobacco
|
||||||
|
e-cigarette
|
||||||
|
muslim ban
|
||||||
|
soleimani
|
||||||
|
cocaine
|
||||||
|
separating families
|
||||||
|
muslim travel ban
|
||||||
|
usmca trade deal
|
||||||
|
shooting
|
||||||
|
overdose
|
||||||
|
separated children
|
||||||
|
coronary heart disease
|
||||||
|
gun-violence
|
||||||
|
opioid
|
||||||
|
flu-season
|
||||||
@@ -18,44 +18,43 @@ socialdistancing
|
|||||||
wear a mask
|
wear a mask
|
||||||
lockdown
|
lockdown
|
||||||
covd
|
covd
|
||||||
Coronavirus
|
coronavirus
|
||||||
Koronavirus
|
koronavirus
|
||||||
Corona
|
corona
|
||||||
CDC
|
cdc
|
||||||
Wuhancoronavirus
|
wuhancoronavirus
|
||||||
Wuhanlockdown
|
wuhanlockdown
|
||||||
Ncov
|
ncov
|
||||||
Wuhan
|
wuhan
|
||||||
N95
|
n95
|
||||||
Kungflu
|
kungflu
|
||||||
Epidemic
|
epidemic
|
||||||
outbreak
|
outbreak
|
||||||
Sinophobia
|
sinophobia
|
||||||
China
|
|
||||||
covid-19
|
covid-19
|
||||||
corona virus
|
corona virus
|
||||||
covid
|
covid
|
||||||
covid19
|
covid19
|
||||||
sars-cov-2
|
sars-cov-2
|
||||||
COVIDー19
|
covidー19
|
||||||
COVD
|
covd
|
||||||
pandemic
|
pandemic
|
||||||
coronapocalypse
|
coronapocalypse
|
||||||
canceleverything
|
canceleverything
|
||||||
Coronials
|
coronials
|
||||||
SocialDistancingNow
|
socialdistancingnow
|
||||||
Social Distancing
|
social distancing
|
||||||
SocialDistancing
|
socialdistancing
|
||||||
panicbuy
|
panicbuy
|
||||||
panic buy
|
panic buy
|
||||||
panicbuying
|
panicbuying
|
||||||
panic buying
|
panic buying
|
||||||
14DayQuarantine
|
14dayquarantine
|
||||||
DuringMy14DayQuarantine
|
duringmy14dayquarantine
|
||||||
panic shop
|
panic shop
|
||||||
panic shopping
|
panic shopping
|
||||||
panicshop
|
panicshop
|
||||||
InMyQuarantineSurvivalKit
|
inmyquarantinesurvivalkit
|
||||||
panic-buy
|
panic-buy
|
||||||
panic-shop
|
panic-shop
|
||||||
coronakindness
|
coronakindness
|
||||||
@@ -65,7 +64,7 @@ chinesevirus
|
|||||||
stayhomechallenge
|
stayhomechallenge
|
||||||
stay home challenge
|
stay home challenge
|
||||||
sflockdown
|
sflockdown
|
||||||
DontBeASpreader
|
dontbeaspreader
|
||||||
lockdown
|
lockdown
|
||||||
lock down
|
lock down
|
||||||
shelteringinplace
|
shelteringinplace
|
||||||
@@ -79,13 +78,13 @@ flatten the curve
|
|||||||
china virus
|
china virus
|
||||||
chinavirus
|
chinavirus
|
||||||
quarentinelife
|
quarentinelife
|
||||||
PPEshortage
|
ppeshortage
|
||||||
saferathome
|
saferathome
|
||||||
stayathome
|
stayathome
|
||||||
stay at home
|
stay at home
|
||||||
stay home
|
stay home
|
||||||
stayhome
|
stayhome
|
||||||
GetMePPE
|
getmeppe
|
||||||
covidiot
|
covidiot
|
||||||
epitwitter
|
epitwitter
|
||||||
pandemie
|
pandemie
|
||||||
@@ -93,7 +92,7 @@ wear a mask
|
|||||||
wearamask
|
wearamask
|
||||||
kung flu
|
kung flu
|
||||||
covididiot
|
covididiot
|
||||||
COVID__19
|
covid__19
|
||||||
omicron
|
omicron
|
||||||
variant
|
variant
|
||||||
vaccine
|
vaccine
|
||||||
@@ -139,9 +138,7 @@ work from home
|
|||||||
workfromhome
|
workfromhome
|
||||||
working from home
|
working from home
|
||||||
workingfromhome
|
workingfromhome
|
||||||
ppe
|
|
||||||
n95
|
n95
|
||||||
ppe
|
|
||||||
n95
|
n95
|
||||||
covidiots
|
covidiots
|
||||||
covidiots
|
covidiots
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,7 @@
|
|||||||
|
epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
|
||||||
|
1,0.39025546515679493,0.40877932761593355,0.9103260869565217,0:10:21,0:00:40
|
||||||
|
2,0.3057803610952067,0.3502063500978377,0.9103260869565217,0:10:53,0:00:43
|
||||||
|
3,0.17910970049364833,0.27903796154904464,0.9375,0:10:30,0:00:38
|
||||||
|
4,0.09279396105943587,0.41342766528301267,0.904891304347826,0:11:03,0:00:43
|
||||||
|
5,0.06132459050129317,0.4468563502887264,0.9239130434782609,0:12:07,0:00:44
|
||||||
|
6,0.04195396880810895,0.4350045176675928,0.9266304347826086,0:11:21,0:00:40
|
||||||
|
@@ -1,13 +1,8 @@
|
|||||||
import re
|
|
||||||
import string
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
from transformers.pipelines.pt_utils import KeyDataset
|
from transformers.pipelines.pt_utils import KeyDataset
|
||||||
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
|
|
||||||
|
|
||||||
|
|
||||||
#%%
|
#%%
|
||||||
# prepare
|
# prepare
|
||||||
@@ -40,7 +35,6 @@ senCSVPretest = "Pretest.csv"
|
|||||||
senCSVPretestPrep = "Pretest-Prep.csv"
|
senCSVPretestPrep = "Pretest-Prep.csv"
|
||||||
senCSVPretestResult = "Pretest-Results.csv"
|
senCSVPretestResult = "Pretest-Results.csv"
|
||||||
|
|
||||||
|
|
||||||
# don't change this one
|
# don't change this one
|
||||||
senCSVPath = wd + ud + senCSV
|
senCSVPath = wd + ud + senCSV
|
||||||
senCSVcPath = wd + ud + senCSVc
|
senCSVcPath = wd + ud + senCSVc
|
||||||
@@ -50,6 +44,11 @@ senCSVcPretestResultPath = wd + ud + senCSVPretestResult
|
|||||||
preTestIDsFakePath = wd + di + preTestIDsFake
|
preTestIDsFakePath = wd + di + preTestIDsFake
|
||||||
preTestIDsNotPath = wd + di + preTestIDsNot
|
preTestIDsNotPath = wd + di + preTestIDsNot
|
||||||
|
|
||||||
|
import sys
|
||||||
|
funs = wd+"funs"
|
||||||
|
sys.path.insert(1, funs)
|
||||||
|
import CleanTweets
|
||||||
|
|
||||||
# List of IDs to select
|
# List of IDs to select
|
||||||
# Read the IDs from a file
|
# Read the IDs from a file
|
||||||
preTestIDsFakeL = []
|
preTestIDsFakeL = []
|
||||||
@@ -85,11 +84,7 @@ tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
|
|||||||
|
|
||||||
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
|
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
|
||||||
|
|
||||||
dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(remove_URL)
|
dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(CleanTweets.preprocess_text)
|
||||||
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_emoji)
|
|
||||||
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_html)
|
|
||||||
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_punct)
|
|
||||||
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(lambda x: x.lower())
|
|
||||||
|
|
||||||
#%%
|
#%%
|
||||||
timeStart = datetime.now() # start counting execution time
|
timeStart = datetime.now() # start counting execution time
|
||||||
|
|||||||
35
repairmystupidity.py
Normal file
35
repairmystupidity.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Mon Aug 14 20:47:22 2023
|
||||||
|
|
||||||
|
@author: michael
|
||||||
|
"""
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
wd = "/home/michael/Documents/PS/Data/collectTweets/"
|
||||||
|
|
||||||
|
# datafile input directory
|
||||||
|
di = "data/IN/"
|
||||||
|
|
||||||
|
# Tweet-datafile output directory
|
||||||
|
ud = "data/OUT/"
|
||||||
|
|
||||||
|
falsch = wd + ud + "SenatorsTweets-Training_WORKING-COPY-correct.csv"
|
||||||
|
richtig = wd + ud + "SenatorsTweets-Training.csv"
|
||||||
|
correct = wd + ud + "SenatorsTweets-Training_WORKING-COPY-correct2.csv"
|
||||||
|
|
||||||
|
# Name of new datafile generated
|
||||||
|
senCSVprep = "SenatorsTweets-Training_WORKING-COPY-prepared"
|
||||||
|
|
||||||
|
# don't change this one
|
||||||
|
falsch = pd.read_csv(falsch, dtype=(object), sep=";")
|
||||||
|
richtig = pd.read_csv(richtig, dtype=(object))
|
||||||
|
|
||||||
|
df = pd.merge(falsch,richtig[['tid','rawContent', 'date']],on='tid', how='left')
|
||||||
|
df.drop(columns=['rawContent_x', 'date_x'], inplace=True)
|
||||||
|
df.rename(columns={'tid_y':'tid', 'rawContent_y':'rawContent', 'date_y':'date'}, inplace=True)
|
||||||
|
df = df[['tid','date','topicCovid','fake','rawContent','Unnamed: 6']]
|
||||||
|
df.rename(columns={'Unnamed: 6':'comment'}, inplace=True)
|
||||||
|
|
||||||
|
df.to_csv(correct, encoding='utf-8', sep=";")
|
||||||
@@ -15,10 +15,8 @@ from sklearn.model_selection import train_test_split # pip install scikit-learn
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
## Follow these two guides:
|
## Uses snippets from this guide:
|
||||||
# best one https://mccormickml.com/2019/07/22/BERT-fine-tuning/
|
# https://mccormickml.com/2019/07/22/BERT-fine-tuning/
|
||||||
# https://xiangyutang2.github.io/tweet-classification/
|
|
||||||
# https://medium.com/mlearning-ai/fine-tuning-bert-for-tweets-classification-ft-hugging-face-8afebadd5dbf
|
|
||||||
|
|
||||||
###################
|
###################
|
||||||
# Setup directories
|
# Setup directories
|
||||||
|
|||||||
@@ -15,10 +15,8 @@ from sklearn.model_selection import train_test_split # pip install scikit-learn
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
## Follow these two guides:
|
## Uses snippets from this guide:
|
||||||
# best one https://mccormickml.com/2019/07/22/BERT-fine-tuning/
|
# https://mccormickml.com/2019/07/22/BERT-fine-tuning/
|
||||||
# https://xiangyutang2.github.io/tweet-classification/
|
|
||||||
# https://medium.com/mlearning-ai/fine-tuning-bert-for-tweets-classification-ft-hugging-face-8afebadd5dbf
|
|
||||||
|
|
||||||
###################
|
###################
|
||||||
# Setup directories
|
# Setup directories
|
||||||
@@ -65,11 +63,7 @@ seed = 12355
|
|||||||
modCovClassPath = wd + "models/CovClass/"
|
modCovClassPath = wd + "models/CovClass/"
|
||||||
modFakeClassPath = wd + "models/FakeClass/"
|
modFakeClassPath = wd + "models/FakeClass/"
|
||||||
|
|
||||||
model_name = 'digitalepidemiologylab/covid-twitter-bert-v2' # accuracy 69
|
|
||||||
#model_name = 'justinqbui/bertweet-covid19-base-uncased-pretraining-covid-vaccine-tweets' #48
|
|
||||||
#model_name = "cardiffnlp/tweet-topic-latest-multi"
|
|
||||||
model_name = "bvrau/covid-twitter-bert-v2-struth"
|
model_name = "bvrau/covid-twitter-bert-v2-struth"
|
||||||
#model_name = "cardiffnlp/roberta-base-tweet-topic-single-all"
|
|
||||||
model_fake_name = 'bvrau/covid-twitter-bert-v2-struth'
|
model_fake_name = 'bvrau/covid-twitter-bert-v2-struth'
|
||||||
|
|
||||||
# More models for fake detection:
|
# More models for fake detection:
|
||||||
Reference in New Issue
Block a user