Compare commits
4 Commits
3de6d8f3ec
...
a26d150060
| Author | SHA1 | Date | |
|---|---|---|---|
| a26d150060 | |||
| d791e4a293 | |||
| d57b7a31b7 | |||
| 13d80124d3 |
113
Classification.py
Normal file
113
Classification.py
Normal file
@@ -0,0 +1,113 @@
|
|||||||
|
import re
|
||||||
|
import string
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from datetime import datetime
|
||||||
|
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
||||||
|
from datasets import load_dataset
|
||||||
|
from transformers.pipelines.pt_utils import KeyDataset
|
||||||
|
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
|
||||||
|
|
||||||
|
|
||||||
|
#%%
# Classify the COVID-related senator tweets with the
# bvrau/covid-twitter-bert-v2-struth text-classification model and write the
# predicted label and score for every tweet to a results CSV.
#
# prepare & define paths
# install xformers (pip install xformers) for better performance
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'

# datafile input directory
di = "data/IN/"

# Tweet-datafile output directory
ud = "data/OUT/"

# Name of file that all senator data will be written to
senCSV = "SenatorsTweets-OnlyCov.csv"

# Name of Classify datafile
senCSVClassifiedPrep = "Tweets-Classified-Prep.csv"
senCSVClassifiedResult = "Tweets-Classified-Results.csv"

# don't change this one
senCSVPath = wd + ud + senCSV
senCSVcClassificationPrepPath = wd + ud + senCSVClassifiedPrep
senCSVcClassificationResultPath = wd + ud + senCSVClassifiedResult

#%%
# get dataframe from csv; every column is read as object (no dtype inference)
dfClassify = pd.read_csv(senCSVPath, dtype=object)

# placeholder column, initialised to False for every tweet
dfClassify['fake'] = False

#%%
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
# HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
modelName = "bvrau/covid-twitter-bert-v2-struth"
model = AutoModelForSequenceClassification.from_pretrained(modelName)
tokenizer = AutoTokenizer.from_pretrained(modelName)
# Hand the already loaded model/tokenizer to the pipeline instead of the model
# name, so the weights are not loaded a second time.
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
# Clean the raw tweet text step by step, then lowercase it.
dfClassify['cleanContent'] = (
    dfClassify['rawContent']
    .apply(remove_URL)
    .apply(remove_emoji)
    .apply(remove_html)
    .apply(remove_punct)
    .apply(str.lower)
)

#%%
# remove rows whose cleaned text is empty
# (assign the result back: a chained `inplace=True` on a column accessor is
# not guaranteed to modify dfClassify)
dfClassify['cleanContent'] = dfClassify['cleanContent'].replace('', np.nan)
dfClassify = dfClassify.dropna(subset=['cleanContent'])

#%%
timeStart = datetime.now()  # start counting execution time

max_length = 128
# NOTE(review): no truncation is requested here, so tweets longer than
# max_length tokens presumably keep all their tokens - confirm this is intended.
dfClassify['input_ids'] = dfClassify['cleanContent'].apply(
    lambda x: tokenizer(x, max_length=max_length, padding="max_length")['input_ids']
)

#%%
# write the cleaned text to disk so it can be reloaded as a dataset
dfClassify.to_csv(senCSVcClassificationPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])

#%%
dataset = load_dataset("csv", data_files=senCSVcClassificationPrepPath)

#%%
# Run the classifier over the cleaned tweets in batches of 8 and collect one
# label and one score per tweet, in dataset order.
# (wrap the pipe(...) call in tqdm from tqdm.auto to get a progress bar)
output_labels = []
output_score = []
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
    # each `out` is a dict with the keys 'label' and 'score'
    output_labels.append(out['label'])
    output_score.append(out['score'])

#%%
# Lists are assigned by position; pandas raises if their length differs from
# the number of rows in dfClassify.
dfClassify['output_label'] = output_labels
dfClassify['output_score'] = output_score

timeEnd = datetime.now()
timeTotal = timeEnd - timeStart
# average over the tweets that were actually classified (was hard-coded to 96)
tweetCount = len(dfClassify)
timePerTweet = timeTotal / tweetCount if tweetCount else timeTotal

print(f"Total classification execution time: {timeTotal.total_seconds()} seconds")
print(f"Time per tweet classification: {timePerTweet}")

#%%
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')

#%%
|
||||||
@@ -88,7 +88,7 @@ with open(f"{di}keywords-raw.txt", "r") as file:
|
|||||||
|
|
||||||
# delete keywords ppe and china that lead to too many false positives
|
# delete keywords ppe and china that lead to too many false positives
|
||||||
removeWords = {'ppe', 'china'}
|
removeWords = {'ppe', 'china'}
|
||||||
keywords = [x.lower() for x in keywords] # converts to lowercase which makes the search case insensitive
|
keywords = [x.lower() for x in keywords] # converts to lowercase which makes the search case insensitive. convert to set to speed up comparison
|
||||||
keywords = [item for item in keywords if item not in removeWords ] # removes words
|
keywords = [item for item in keywords if item not in removeWords ] # removes words
|
||||||
|
|
||||||
with open(f"{di}keywords.txt", "w") as file:
|
with open(f"{di}keywords.txt", "w") as file:
|
||||||
@@ -96,17 +96,38 @@ with open(f"{di}keywords.txt", "w") as file:
|
|||||||
for line in keywords:
|
for line in keywords:
|
||||||
file.write(f'{line}\n')
|
file.write(f'{line}\n')
|
||||||
|
|
||||||
|
# counter keywords
|
||||||
|
# Read the keywords from a file
|
||||||
|
counterKeywords = []
|
||||||
|
with open(f"{di}counterKeywords.txt", "r") as file:
|
||||||
|
lines = file.readlines()
|
||||||
|
for line in lines:
|
||||||
|
counterKeyword = line.strip() # Remove the newline character
|
||||||
|
counterKeywords.append(counterKeyword)
|
||||||
|
counterKeywords = set([x.lower() for x in counterKeywords]) # converts to lowercase which makes the search case insensitive. convert to set to speed up comparison
|
||||||
|
with open(f"{di}counterKeywordsFinal.txt", "w") as file:
|
||||||
|
print("read keyword files")
|
||||||
|
for line in counterKeywords:
|
||||||
|
file.write(f'{line}\n')
|
||||||
|
|
||||||
#%%
|
#%%
|
||||||
# overwrite keyword column
|
# overwrite keyword column
|
||||||
df['keywords'] = np.nan
|
df['keywords'] = np.nan
|
||||||
df['keywords'] = (
|
df['keywords'] = (
|
||||||
df['rawContent'].str.lower().str.findall('|'.join(keywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive
|
df['rawContent'].str.lower().str.findall('|'.join(keywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive
|
||||||
)
|
)
|
||||||
|
df['counterKeywords'] = np.nan
|
||||||
|
df['counterKeywords'] = (
|
||||||
|
df['rawContent'].str.lower().str.findall('|'.join(counterKeywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive
|
||||||
|
)
|
||||||
#%%
|
#%%
|
||||||
# create boolean contains_keyword column
|
# create boolean contains_keyword column
|
||||||
df['contains_keyword'] = True
|
df['contains_keyword'] = True
|
||||||
|
df['contains_counterKeyword'] = True
|
||||||
mask = (df['keywords'].isna()) # select all values in contains_keyword == 'none'
|
mask = (df['keywords'].isna()) # select all values in contains_keyword == 'none'
|
||||||
df.loc[mask,'contains_keyword'] = False # set keywords = contains_keyword under the condition of mask
|
df.loc[mask,'contains_keyword'] = False # set keywords = contains_keyword under the condition of mask
|
||||||
|
mask = (df['counterKeywords'].isna()) # select all values in contains_keyword == 'none'
|
||||||
|
df.loc[mask,'contains_counterKeyword'] = False # set keywords = contains_keyword under the condition of mask
|
||||||
|
|
||||||
#%%
|
#%%
|
||||||
pd.Series(df["user.id"]).is_unique
|
pd.Series(df["user.id"]).is_unique
|
||||||
@@ -163,7 +184,10 @@ print(unique_usernames)
|
|||||||
# senatorisakson was dropped, is ok
|
# senatorisakson was dropped, is ok
|
||||||
#%%
|
#%%
|
||||||
# create covidtweets csv
|
# create covidtweets csv
|
||||||
dfCov = dfAll[dfAll['contains_keyword']==True]
|
dfCov = dfAll[dfAll['contains_counterKeyword']==False]
|
||||||
|
dfCov = dfCov[dfCov['contains_keyword']==True]
|
||||||
|
dfCov = dfCov.drop(columns=['contains_counterKeyword', 'counterKeywords'])
|
||||||
|
|
||||||
|
|
||||||
#%%
|
#%%
|
||||||
# create column with tweet length
|
# create column with tweet length
|
||||||
|
|||||||
23
data/IN/counterKeywords.txt
Normal file
23
data/IN/counterKeywords.txt
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
opioid
|
||||||
|
gun violence
|
||||||
|
gun-violence
|
||||||
|
CHD
|
||||||
|
Coronary heart disease
|
||||||
|
addiction
|
||||||
|
tobacco
|
||||||
|
vaping
|
||||||
|
e-cigarette
|
||||||
|
shooting
|
||||||
|
indigenous women
|
||||||
|
overdose
|
||||||
|
meth
|
||||||
|
cocaine
|
||||||
|
separated children
|
||||||
|
separating children
|
||||||
|
separating families
|
||||||
|
Muslim travel ban
|
||||||
|
flu-season
|
||||||
|
flu season
|
||||||
|
Soleimani
|
||||||
|
Muslim Ban
|
||||||
|
USMCA trade deal
|
||||||
140
preTestClassification.py
Normal file
140
preTestClassification.py
Normal file
@@ -0,0 +1,140 @@
|
|||||||
|
import re
|
||||||
|
import string
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from datetime import datetime
|
||||||
|
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
||||||
|
from datasets import load_dataset
|
||||||
|
from transformers.pipelines.pt_utils import KeyDataset
|
||||||
|
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
|
||||||
|
|
||||||
|
|
||||||
|
#%%
# Pretest: classify a hand-labelled sample of tweets (IDs listed in two text
# files, one for fake and one for non-fake tweets) with the
# bvrau/covid-twitter-bert-v2-struth model, store the predictions next to the
# manual 'fake' label and estimate the runtime of a full classification run.
#
# prepare
# install xformers (pip install xformers) for better performance
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'

# datafile input directory
di = "data/IN/"

# Tweet-datafile output directory
ud = "data/OUT/"

# Name of file that all senator data will be written to
senCSV = "ALL-SENATORS-TWEETS.csv"

# Name of new datafile generated
senCSVc = "Tweets-Stub.csv"

# Name of pretest files
preTestIDsFake = "pretest-tweets_fake.txt"
preTestIDsNot = "pretest-tweets_not_fake.txt"

# Name of pretest datafile
senCSVPretest = "Pretest.csv"
senCSVPretestPrep = "Pretest-Prep.csv"
senCSVPretestResult = "Pretest-Results.csv"

# Number of tweets used for the full-run time estimate printed at the end
fullTweetCount = 50183

# don't change this one
senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSVc
senCSVcPretestPath = wd + ud + senCSVPretest
senCSVcPretestPrepPath = wd + ud + senCSVPretestPrep
senCSVcPretestResultPath = wd + ud + senCSVPretestResult
preTestIDsFakePath = wd + di + preTestIDsFake
preTestIDsNotPath = wd + di + preTestIDsNot

# List of IDs to select
# Read the IDs from the files: one ID per line, surrounding whitespace
# stripped, blank lines skipped.
with open(preTestIDsFakePath, "r") as file:
    preTestIDsFakeL = [line.strip() for line in file if line.strip()]
with open(preTestIDsNotPath, "r") as file:
    preTestIDsNotL = [line.strip() for line in file if line.strip()]

# Select rows based on the IDs; every column is read as object, so the tweet
# IDs stay strings and can be compared with the IDs read above.
df = pd.read_csv(senCSVPath, dtype=object)

#%%
# Create pretest dataframe: fake tweets first, then the non-fake ones, each
# part labelled explicitly in the 'fake' column.
dfPreTest = pd.concat(
    [
        df[df['id'].isin(preTestIDsFakeL)].assign(fake=True),
        df[df['id'].isin(preTestIDsNotL)].assign(fake=False),
    ],
    ignore_index=True,
)

#%%
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
# HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
modelName = "bvrau/covid-twitter-bert-v2-struth"
model = AutoModelForSequenceClassification.from_pretrained(modelName)
tokenizer = AutoTokenizer.from_pretrained(modelName)
# Hand the already loaded model/tokenizer to the pipeline instead of the model
# name, so the weights are not loaded a second time.
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
# Clean the raw tweet text step by step, then lowercase it.
dfPreTest['cleanContent'] = (
    dfPreTest['rawContent']
    .apply(remove_URL)
    .apply(remove_emoji)
    .apply(remove_html)
    .apply(remove_punct)
    .apply(str.lower)
)

#%%
# Remove rows whose cleaned text is empty: an empty field is read back from
# the prep CSV as a missing value, which the classifier cannot process.
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].replace('', np.nan)
dfPreTest = dfPreTest.dropna(subset=['cleanContent'])

#%%
timeStart = datetime.now()  # start counting execution time

max_length = 128
# NOTE(review): no truncation is requested here, so tweets longer than
# max_length tokens presumably keep all their tokens - confirm this is intended.
dfPreTest['input_ids'] = dfPreTest['cleanContent'].apply(
    lambda x: tokenizer(x, max_length=max_length, padding="max_length")['input_ids']
)

#%%
# write the cleaned text to disk so it can be reloaded as a dataset
dfPreTest.to_csv(senCSVcPretestPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])

#%%
dataset = load_dataset("csv", data_files=senCSVcPretestPrepPath)

#%%
# Run the classifier over the cleaned tweets in batches of 8 and collect one
# label and one score per tweet, in dataset order.
# (wrap the pipe(...) call in tqdm from tqdm.auto to get a progress bar)
output_labels = []
output_score = []
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
    # each `out` is a dict with the keys 'label' and 'score'
    output_labels.append(out['label'])
    output_score.append(out['score'])

#%%
# Lists are assigned by position; pandas raises if their length differs from
# the number of rows in dfPreTest.
dfPreTest['output_label'] = output_labels
dfPreTest['output_score'] = output_score

timeEnd = datetime.now()
timeTotal = timeEnd - timeStart
# average over the tweets that were actually classified (was hard-coded to 96)
tweetCount = len(dfPreTest)
timePerTweet = timeTotal / tweetCount if tweetCount else timeTotal

print(f"Total classification execution time: {timeTotal.total_seconds()} seconds")
print(f"Time per tweet classification: {timePerTweet}")
print(f"Estimated time for full classification of tweets: {timePerTweet*fullTweetCount}")

#%%
dfPreTest.to_csv(senCSVcPretestResultPath, encoding='utf-8')

#%%
|
||||||
Reference in New Issue
Block a user