4 Commits

4 changed files with 302 additions and 2 deletions

Classification.py (new file, 113 lines)

@@ -0,0 +1,113 @@
import re
import string
import numpy as np
import pandas as pd
from datetime import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
#%%
# prepare & define paths
# install xformers (pip install xformers) for better performance
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
# Name of file that all senator data will be written to
senCSV = "SenatorsTweets-OnlyCov.csv"
# Names of the classification datafiles
senCSVClassifiedPrep = "Tweets-Classified-Prep.csv"
senCSVClassifiedResult = "Tweets-Classified-Results.csv"
# don't change these
senCSVPath = wd + ud + senCSV
senCSVcClassificationPrepPath = wd + ud + senCSVClassifiedPrep
senCSVcClassificationResultPath = wd + ud + senCSVClassifiedResult
#%%
# read dataframe from csv
dfClassify = pd.read_csv(senCSVPath, dtype=(object))
dfClassify['fake'] = False
#%%
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
# HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth")
model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
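# Note: pipeline() above already loads its own copy of the model and tokenizer;
# the standalone tokenizer is only used below to precompute input_ids, and the
# standalone model object is not used further in this script.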
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
dfClassify['cleanContent'] = dfClassify['rawContent'].apply(remove_URL)
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(remove_emoji)
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(remove_html)
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(remove_punct)
dfClassify['cleanContent'] = dfClassify['cleanContent'].apply(lambda x: x.lower())
#%%
# remove empty rows
dfClassify['cleanContent'] = dfClassify['cleanContent'].replace('', np.nan)
dfClassify.dropna(subset=['cleanContent'], inplace=True)
#%%
timeStart = datetime.now() # start counting execution time
max_length = 128
dfClassify['input_ids'] = dfClassify['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
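# The pipeline re-tokenizes the raw text internally, so the input_ids column is
# informational only; it is also not among the columns written to the prep CSV below.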
#train.rename(columns={'target': 'labels'}, inplace=True)
#train.head()
# %%
dfClassify.to_csv(senCSVcClassificationPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
#%%
dataset = load_dataset("csv", data_files=senCSVcClassificationPrepPath)
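# load_dataset("csv", ...) returns a DatasetDict with a single "train" split,
# which is why the rows are accessed as dataset['train'] below.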
# %%
#from tqdm.auto import tqdm
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
# print(out)
#%%
output_labels = []
output_score = []
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
output_labels.append(out['label'])
output_score.append(out['score'])
# [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
# Exactly the same output as before, but the contents are passed
# as batches to the model
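# KeyDataset streams just the 'cleanContent' column to the pipeline, so the
# tweets are tokenized and classified batch by batch instead of all at once.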
# %%
dfClassify['output_label'] = output_labels
dfClassify['output_score'] = output_score
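# A possible follow-up, to fill the 'fake' column from the predictions
# (the label names are an assumption here; check model.config.id2label):
# dfClassify['fake'] = dfClassify['output_label'].str.lower().eq('fake')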
timeEnd = datetime.now()
timeTotal = timeEnd - timeStart
timePerTweet = timeTotal / len(dfClassify) # average over all classified tweets
print(f"Total classification execution time: {timeTotal}")
print(f"Time per tweet classification: {timePerTweet}")
# %%
dfClassify.to_csv(senCSVcClassificationResultPath, encoding='utf-8')
# %%


@@ -88,7 +88,7 @@ with open(f"{di}keywords-raw.txt", "r") as file:
 # delete keywords ppe and china that lead to too many false positives
 removeWords = {'ppe', 'china'}
-keywords = [x.lower() for x in keywords] # converts to lowercase which makes the search case insensitive
+keywords = [x.lower() for x in keywords] # converts to lowercase which makes the search case insensitive. convert to set to speed up comparison
 keywords = [item for item in keywords if item not in removeWords ] # removes words
 with open(f"{di}keywords.txt", "w") as file:
@@ -96,17 +96,38 @@ with open(f"{di}keywords.txt", "w") as file:
     for line in keywords:
         file.write(f'{line}\n')
+# counter keywords
+# Read the keywords from a file
+counterKeywords = []
+with open(f"{di}counterKeywords.txt", "r") as file:
+    lines = file.readlines()
+    for line in lines:
+        counterKeyword = line.strip() # Remove the newline character
+        counterKeywords.append(counterKeyword)
+counterKeywords = set([x.lower() for x in counterKeywords]) # converts to lowercase which makes the search case insensitive. convert to set to speed up comparison
+with open(f"{di}counterKeywordsFinal.txt", "w") as file:
+    print("read keyword files")
+    for line in counterKeywords:
+        file.write(f'{line}\n')
 #%%
 # overwrite keyword column
 df['keywords'] = np.nan
 df['keywords'] = (
     df['rawContent'].str.lower().str.findall('|'.join(keywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive
 )
+df['counterKeywords'] = np.nan
+df['counterKeywords'] = (
+    df['rawContent'].str.lower().str.findall('|'.join(counterKeywords)).str.join(',').replace('', np.nan) # str.lower to make search case-insensitive
+)
 #%%
 # create boolean contains_keyword column
 df['contains_keyword'] = True
+df['contains_counterKeyword'] = True
 mask = (df['keywords'].isna()) # select all values in contains_keyword == 'none'
 df.loc[mask,'contains_keyword'] = False # set keywords = contains_keyword under the condition of mask
+mask = (df['counterKeywords'].isna()) # select all values in contains_keyword == 'none'
+df.loc[mask,'contains_counterKeyword'] = False # set keywords = contains_keyword under the condition of mask
 #%%
 pd.Series(df["user.id"]).is_unique
@@ -163,7 +184,10 @@ print(unique_usernames)
 # senatorisakson was dropped, is ok
 #%%
 # create covidtweets csv
-dfCov = dfAll[dfAll['contains_keyword']==True]
+dfCov = dfAll[dfAll['contains_counterKeyword']==False]
+dfCov = dfCov[dfCov['contains_keyword']==True]
+dfCov = dfCov.drop(columns=['contains_counterKeyword', 'counterKeywords'])
 #%%
 # create column with tweet length

counterKeywords.txt (new file, 23 lines)

@@ -0,0 +1,23 @@
opioid
gun violence
gun-violence
CHD
Coronary heart disease
addiction
tobacco
vaping
e-cigarette
shooting
indigenous women
overdose
meth
cocaine
separated children
separating children
separating families
Muslim travel ban
flu-season
flu season
Soleimani
Muslim Ban
USMCA trade deal

preTestClassification.py (new file, 140 lines)

@@ -0,0 +1,140 @@
import re
import string
import numpy as np
import pandas as pd
from datetime import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
#%%
# prepare
# install xformers (pip install xformers) for better performance
###################
# Setup directories
# WD Michael
wd = "/home/michael/Documents/PS/Data/collectTweets/"
# WD Server
# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
# datafile input directory
di = "data/IN/"
# Tweet-datafile output directory
ud = "data/OUT/"
# Name of file that all senator data will be written to
senCSV = "ALL-SENATORS-TWEETS.csv"
# Name of new datafile generated
senCSVc = "Tweets-Stub.csv"
# Names of the pretest ID files
preTestIDsFake = "pretest-tweets_fake.txt"
preTestIDsNot = "pretest-tweets_not_fake.txt"
# Names of the pretest datafiles
senCSVPretest = "Pretest.csv"
senCSVPretestPrep = "Pretest-Prep.csv"
senCSVPretestResult = "Pretest-Results.csv"
# don't change these
senCSVPath = wd + ud + senCSV
senCSVcPath = wd + ud + senCSVc
senCSVcPretestPath = wd + ud + senCSVPretest
senCSVcPretestPrepPath = wd + ud + senCSVPretestPrep
senCSVcPretestResultPath = wd + ud + senCSVPretestResult
preTestIDsFakePath = wd + di + preTestIDsFake
preTestIDsNotPath = wd + di + preTestIDsNot
# List of IDs to select
# Read the IDs from a file
preTestIDsFakeL = []
preTestIDsNotL = []
with open(preTestIDsFakePath, "r") as file:
lines = file.readlines()
for line in lines:
tid = line.strip() # Remove the newline character
preTestIDsFakeL.append(tid)
with open(preTestIDsNotPath, "r") as file:
lines = file.readlines()
for line in lines:
tid = line.strip() # Remove the newline character
preTestIDsNotL.append(tid)
# Select rows based on the IDs
df = pd.read_csv(senCSVPath, dtype=(object))
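# (reading with dtype=object keeps the ids as strings, so they match the
# string IDs collected from the pretest files above)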
#%%
# Create pretest dataframe
dfPreTest = df[df['id'].isin(preTestIDsFakeL)].copy()
dfPreTest['fake'] = True
dfPreTest = pd.concat([dfPreTest, df[df['id'].isin(preTestIDsNotL)]], ignore_index=True)
dfPreTest['fake'] = dfPreTest['fake'].fillna(False)
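# Rows concatenated from the not-fake list have no 'fake' value yet,
# so fillna(False) labels them as not fake.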
#%%
# https://huggingface.co/bvrau/covid-twitter-bert-v2-struth
# HowTo:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth")
model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
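# (as in Classification.py, the pipeline bundles its own model and tokenizer;
# only the standalone tokenizer is used below, to precompute input_ids)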
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
dfPreTest['cleanContent'] = dfPreTest['rawContent'].apply(remove_URL)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_emoji)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_html)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(remove_punct)
dfPreTest['cleanContent'] = dfPreTest['cleanContent'].apply(lambda x: x.lower())
#%%
timeStart = datetime.now() # start counting execution time
max_length = 128
dfPreTest['input_ids'] = dfPreTest['cleanContent'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
#train.rename(columns={'target': 'labels'}, inplace=True)
#train.head()
# %%
dfPreTest.to_csv(senCSVcPretestPrepPath, encoding='utf-8', columns=['id', 'cleanContent'])
#%%
dataset = load_dataset("csv", data_files=senCSVcPretestPrepPath)
# %%
# superseded by the batched classification loop below:
# results = pipe(KeyDataset(dataset['train'], "cleanContent"))
# %%
#from tqdm.auto import tqdm
#for out in tqdm(pipe(KeyDataset(dataset['train'], "cleanContent"))):
# print(out)
#%%
output_labels = []
output_score = []
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation="only_first"):
output_labels.append(out['label'])
output_score.append(out['score'])
# [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
# Exactly the same output as before, but the contents are passed
# as batches to the model
# %%
dfPreTest['output_label'] = output_labels
dfPreTest['output_score'] = output_score
timeEnd = datetime.now()
timeTotal = timeEnd - timeStart
timePerTweet = timeTotal / len(dfPreTest) # average over the pretest sample
print(f"Total classification execution time: {timeTotal}")
print(f"Time per tweet classification: {timePerTweet}")
print(f"Estimated time for full classification of tweets: {timePerTweet*50183}")
# %%
dfPreTest.to_csv(senCSVcPretestResultPath, encoding='utf-8')
# %%
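# A quick sanity check of pretest performance could be sketched as follows
# (again assuming 'fake'/'real' style labels from the model):
# predictedFake = dfPreTest['output_label'].str.lower().eq('fake')
# print(f"Pretest accuracy: {(predictedFake == dfPreTest['fake']).mean():.2%}")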