|
|
|
|
@@ -1,12 +1,9 @@
|
|
|
|
|
import re
|
|
|
|
|
import string
|
|
|
|
|
import numpy as np
|
|
|
|
|
import pandas as pd
|
|
|
|
|
from datetime import datetime
|
|
|
|
|
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
|
|
|
|
from datasets import load_dataset
|
|
|
|
|
from transformers.pipelines.pt_utils import KeyDataset
|
|
|
|
|
from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#%%
|
|
|
|
|
@@ -26,11 +23,11 @@ di = "data/IN/"
|
|
|
|
|
ud = "data/OUT/"
|
|
|
|
|
|
|
|
|
|
# Name of file that all senator data will be written to
|
|
|
|
|
senCSV = "SenatorsTweets-OnlyCov.csv"
|
|
|
|
|
senCSV = "Tweets-Classified-Topic-Results.csv"
|
|
|
|
|
|
|
|
|
|
# Name of Classify datafile
|
|
|
|
|
senCSVClassifiedPrep = "Tweets-Classified-Topic-Prep.csv"
|
|
|
|
|
senCSVClassifiedResult = "Tweets-Classified-Topic-Results.csv"
|
|
|
|
|
senCSVClassifiedPrep = "Tweets-Classified-Fake-Prep.csv"
|
|
|
|
|
senCSVClassifiedResult = "Tweets-Classified-Fake-Results.csv"
|
|
|
|
|
|
|
|
|
|
# don't change this one
|
|
|
|
|
senCSVPath = wd + ud + senCSV
|
|
|
|
|
@@ -46,6 +43,16 @@ import CleanTweets
|
|
|
|
|
#%%
|
|
|
|
|
# get dataframe
|
|
|
|
|
dfClassify = pd.read_csv(senCSVPath, dtype=(object))
|
|
|
|
|
def encode_labels(label):
    """Swap the string labels 'True' and 'False' for each other.

    Any other value (NaN, ints, unexpected strings) maps to the
    integer 0, which never matches the later ``== 'True'`` filter.
    """
    swapped = {'True': 'False', 'False': 'True'}
    return swapped.get(label, 0)
|
|
|
|
|
dfClassify['output_label_topicCov'] = dfClassify['output_label_topicCov'].apply(encode_labels)
|
|
|
|
|
dfClassify.to_csv("/home/michael/Documents/PS/Data/collectTweets/data/OUT/Tweets-Classified-Topic-Results.csv", encoding='utf-8')
|
|
|
|
|
|
|
|
|
|
dfClassify = dfClassify[dfClassify['output_label_topicCov']=='True']
|
|
|
|
|
|
|
|
|
|
# dataframe from csv
|
|
|
|
|
dfClassify['fake'] = False
|
|
|
|
|
@@ -56,9 +63,9 @@ dfClassify['fake'] = False
|
|
|
|
|
# HowTo:
|
|
|
|
|
# https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
|
|
|
|
|
# https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
|
|
|
|
|
pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth")
|
|
|
|
|
model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
|
|
|
|
|
tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
|
|
|
|
|
pipe = pipeline("text-classification", model="/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
|
|
|
|
|
model = AutoModelForSequenceClassification.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
|
|
|
|
|
tokenizer = AutoTokenizer.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
|
|
|
|
|
|
|
|
|
|
# Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
|
|
|
|
|
|
|
|
|
|
@@ -100,8 +107,8 @@ for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, trun
|
|
|
|
|
# Exactly the same output as before, but the contents are passed
|
|
|
|
|
# as batches to the model
|
|
|
|
|
# %%
|
|
|
|
|
dfClassify['output_label'] = output_labels
|
|
|
|
|
dfClassify['output_score'] = output_score
|
|
|
|
|
dfClassify['output_label_fake'] = output_labels
|
|
|
|
|
dfClassify['output_score_fake'] = output_score
|
|
|
|
|
|
|
|
|
|
timeEnd = datetime.now()
|
|
|
|
|
timeTotal = timeEnd - timeStart
|
|
|
|
|
|