|
|
|
|
@@ -1,12 +1,9 @@
|
|
|
|
|
import re
|
|
|
|
|
import string
|
|
|
|
|
import numpy as np
|
|
|
|
|
import pandas as pd
|
|
|
|
|
from datetime import datetime
|
|
|
|
|
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
|
|
|
|
from datasets import load_dataset
|
|
|
|
|
from transformers.pipelines.pt_utils import KeyDataset
|
|
|
|
|
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#%%
|
|
|
|
|
@@ -26,11 +23,11 @@ di = "data/IN/"
|
|
|
|
|
ud = "data/OUT/"
|
|
|
|
|
|
|
|
|
|
# Name of file that all senator data will be written to
|
|
|
|
|
senCSV = "SenatorsTweets-OnlyCov.csv"
|
|
|
|
|
senCSV = "Tweets-Classified-Topic-Results.csv"
|
|
|
|
|
|
|
|
|
|
# Name of Classify datafile
|
|
|
|
|
senCSVClassifiedPrep = "Tweets-Classified-Topic-Prep.csv"
|
|
|
|
|
senCSVClassifiedResult = "Tweets-Classified-Topic-Results.csv"
|
|
|
|
|
senCSVClassifiedPrep = "Tweets-Classified-Fake-Prep.csv"
|
|
|
|
|
senCSVClassifiedResult = "Tweets-Classified-Fake-Results.csv"
|
|
|
|
|
|
|
|
|
|
# don't change this one
|
|
|
|
|
senCSVPath = wd + ud + senCSV
|
|
|
|
|
@@ -46,6 +43,16 @@ import CleanTweets
|
|
|
|
|
#%%
|
|
|
|
|
# get dataframe
|
|
|
|
|
dfClassify = pd.read_csv(senCSVPath, dtype=(object))
|
|
|
|
|
def encode_labels(label):
    """Swap the string labels 'True' and 'False' for each other.

    Any other value (NaN, ints, unexpected strings) maps to the
    integer 0, which never matches the later ``== 'True'`` filter.
    """
    swapped = {'True': 'False', 'False': 'True'}
    return swapped.get(label, 0)
|
|
|
|
|
dfClassify['output_label_topicCov'] = dfClassify['output_label_topicCov'].apply(encode_labels)
|
|
|
|
|
dfClassify.to_csv("/home/michael/Documents/PS/Data/collectTweets/data/OUT/Tweets-Classified-Topic-Results.csv", encoding='utf-8')
|
|
|
|
|
|
|
|
|
|
dfClassify = dfClassify[dfClassify['output_label_topicCov']=='True']
|
|
|
|
|
|
|
|
|
|
# dataframe from csv
|
|
|
|
|
dfClassify['fake'] = False
|
|
|
|
|
@@ -56,9 +63,9 @@ dfClassify['fake'] = False
|
|
|
|
|
# HowTo:
|
|
|
|
|
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
|
|
|
|
|
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
|
|
|
|
|
pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth")
|
|
|
|
|
model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
|
|
|
|
|
tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
|
|
|
|
|
pipe = pipeline("text-classification", model="/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
|
|
|
|
|
model = AutoModelForSequenceClassification.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
|
|
|
|
|
tokenizer = AutoTokenizer.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
|
|
|
|
|
|
|
|
|
|
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
|
|
|
|
|
|
|
|
|
|
@@ -100,8 +107,8 @@ for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, trun
|
|
|
|
|
# Exactly the same output as before, but the contents are passed
|
|
|
|
|
# as batches to the model
|
|
|
|
|
# %%
|
|
|
|
|
dfClassify['output_label'] = output_labels
|
|
|
|
|
dfClassify['output_score'] = output_score
|
|
|
|
|
dfClassify['output_label_fake'] = output_labels
|
|
|
|
|
dfClassify['output_score_fake'] = output_score
|
|
|
|
|
|
|
|
|
|
timeEnd = datetime.now()
|
|
|
|
|
timeTotal = timeEnd - timeStart
|
|
|
|
|
|