Restructures. adds TimeSlice, ClearDupes and more comments.

2023-06-21 19:07:07 +02:00
parent 2e70d960a5
commit ea7fcc732e
7 changed files with 539 additions and 325 deletions
--- a/config.py
+++ b/config.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+'''
+Created on Wed Jun 21 13:58:42 2023
+
+@author: michael
+'''
+
+## Setup directories
+# WD Michael
+wd = '/home/michael/Documents/PS/Data/collectTweets/'
+# WD Server
+# wd = '/home/yunohost.multimedia/polsoc/Politics & Society/TweetCollection/'
+
+# Tweet-datafile output directory
+td = 'data/tweets/'
+
+# Name of file that all tweets will be written to
+file_alltweets = 'ALL-SENATORS-TWEETS.csv'
+
+path_to_tweetdfs = wd + td
+
+## Define Timespan 
+# Format: %Y-%m-%dT%H:%M:%SZ (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)
+ts_beg = '2020-01-01T00:00:00Z' # start of scraping
+ts_end = '2023-01-03T00:00:00Z' # end of scraping
+no_slices = 24 # Number of slices / time periods.
+
+# Maximum tweets to be scraped by snscrape. Can be left untouched.
+maxTweets = 5000
+
+
+## Install snscrape from local git repo to make sure that it fits the used version.
+# If snscrape is not installed yet, uncomment the following lines (they also need `import os, sys`):
+''' 
+import subprocess
+os.chdir('snscrape/')
+subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', '.'])
+os.chdir(wd) 
+'''
+
+