From 026d6742db956930e6c2f0764a54d15680c8b1bc Mon Sep 17 00:00:00 2001 From: Michael Beck Date: Fri, 31 Jan 2025 00:31:25 +0100 Subject: [PATCH] init --- .gitignore | 198 ++++++++++++++++++++++++ README.md | 61 ++++++++ app.py | 259 ++++++++++++++++++++++++++++++++ example_config.ini | 3 + forms.py | 9 ++ requirements.txt | 87 +++++++++++ static/app.js | 106 +++++++++++++ static/style.css | 0 templates/base.html | 23 +++ templates/download_results.html | 52 +++++++ templates/index.html | 64 ++++++++ templates/results.html | 22 +++ 12 files changed, 884 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 app.py create mode 100644 example_config.ini create mode 100644 forms.py create mode 100644 requirements.txt create mode 100644 static/app.js create mode 100644 static/style.css create mode 100644 templates/base.html create mode 100644 templates/download_results.html create mode 100644 templates/index.html create mode 100644 templates/results.html diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0365968 --- /dev/null +++ b/.gitignore @@ -0,0 +1,198 @@ +# Created by https://www.toptal.com/developers/gitignore/api/flask +# Edit at https://www.toptal.com/developers/gitignore?templates=flask + +### Flask ### +instance/* +!instance/.gitignore +.webassets-cache +.env + +### Flask.Python Stack ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +# End of https://www.toptal.com/developers/gitignore/api/flask + +# Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode +# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode + +### VisualStudioCode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +### VisualStudioCode Patch ### +# Ignore all local history of files +.history +.ionide + +# End of https://www.toptal.com/developers/gitignore/api/visualstudiocode + +# Exclude data files +*.csv +config.ini \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..4cfef25 --- /dev/null +++ b/README.md @@ -0,0 +1,61 @@ +# Torn User Activity Scraper + +This project is a web application that scrapes user activity data from the Torn API and displays the results. It includes features for starting and stopping the scraping process, viewing logs, and downloading results. + +## Features + +- Start and stop scraping user activity data +- View real-time logs +- Download data and log files +- View scraping results and statistics + +## Requirements + +- Python 3.8+ +- Flask +- Flask-Bootstrap +- Flask-WTF +- Pandas +- Requests + +## Installation + +1. Clone the repository: + +```sh +git clone https://github.com/yourusername/torn-user-activity-scraper.git +cd torn-user-activity-scraper +``` + +2. Create a virtual environment and activate it: +```sh +python3 -m venv venv +source venv/bin/activate +``` + +3. Install the required packages: +```sh +pip install -r requirements.txt +``` + +4. Set up your configuration file: +Create a `config.ini` file in the root directory of the project by renaming `example_config.ini` with the following content: + +```ini +[DEFAULT] +SECRET_KEY = your_secret_key +API_KEY = your_api_key +``` + +## Usage + +1. Run the Flask application: +```sh +flask run +``` + +2. Open your web browser and navigate to `http://127.0.0.1:5000/`. + +## License + +This project is licensed under the MIT License. diff --git a/app.py b/app.py new file mode 100644 index 0000000..b374eaa --- /dev/null +++ b/app.py @@ -0,0 +1,259 @@ +from flask import Flask, request, render_template, Response, jsonify, url_for +from flask_bootstrap import Bootstrap5 +from forms import ScrapingForm +import requests +import pandas as pd +import time +from datetime import datetime, timedelta +import threading +import logging +from logging.handlers import QueueHandler +from queue import Queue +import os +import glob +from datetime import datetime +from flask import send_from_directory +import configparser + +app = Flask(__name__) + +# Load configuration +config = configparser.ConfigParser() +config.read('config.ini') + +app.config['SECRET_KEY'] = config['DEFAULT']['SECRET_KEY'] +API_KEY = config['DEFAULT']['API_KEY'] + +bootstrap = Bootstrap5(app) + +# Initialize the logger +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) # Adjust as needed + +# Make any logger.info() call go to both the log file and the queue. +# 1) FILE HANDLER +logFile = "log/" + datetime.now().strftime('%Y-%m-%d-%H-%M') + '.log' +file_handler = logging.FileHandler(logFile, mode='w') +file_handler.setLevel(logging.DEBUG) # or INFO, WARNING, etc. +formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s', + datefmt='%m/%d/%Y %I:%M:%S %p') +file_handler.setFormatter(formatter) + +logger.addHandler(file_handler) + +# 2) QUEUE HANDLER +log_queue = Queue() +queue_handler = QueueHandler(log_queue) +queue_handler.setLevel(logging.DEBUG) +logger.addHandler(queue_handler) + +# Global state +scraping_active = False +scraping_thread = None + +def fetch_faction_data(faction_id): + url = f"https://api.torn.com/faction/{faction_id}?selections=&key={API_KEY}" + response = requests.get(url) + if response.status_code == 200: + logger.info(f"Fetched data for faction ID {faction_id}") + return response.json() + else: + logger.warning(f"Failed to fetch faction data for faction ID {faction_id}") + return None + +def fetch_user_activity(user_id): + url = f"https://api.torn.com/user/{user_id}?selections=basic,profile&key={API_KEY}" + response = requests.get(url) + if response.status_code == 200: + return response.json() + else: + logger.error(f"Failed to fetch user activity for user ID {user_id}") + return None + +def scrape_data(faction_id, fetch_interval, run_interval): + global scraping_active + end_time = datetime.now() + timedelta(days=run_interval) + filename = f"data/{faction_id}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}.csv" + + while datetime.now() < end_time and scraping_active: + logger.info(f"Fetching data at {datetime.now()}") + faction_data = fetch_faction_data(faction_id) + if faction_data and 'members' in faction_data: + user_activity_data = [] + for user_id, user_info in faction_data['members'].items(): + user_activity = fetch_user_activity(user_id) + if user_activity: + user_activity_data.append({ + 'user_id': user_id, + 'name': user_activity.get('name', ''), + 'last_action': user_activity.get('last_action', {}).get('timestamp', 0), + 'status': user_activity.get('status', {}).get('state', ''), + 'timestamp': datetime.now().timestamp() + }) + logger.info(f"Fetched data for user {user_id} ({user_activity.get('name', '')})") + + # Append data to the file + df = pd.DataFrame(user_activity_data) + df['last_action'] = pd.to_datetime(df['last_action'], unit='s') + df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s') + + if not os.path.isfile(filename): + df.to_csv(filename, index=False) + else: + df.to_csv(filename, mode='a', header=False, index=False) + + logger.info(f"Data appended to {filename}") + + time.sleep(fetch_interval) + else: + if datetime.now() < end_time: + logger.warning(f"Scraping stopped at {datetime.now()}") + elif scraping_active == False: + logger.warning(f"Scraping stopped at {datetime.now()} due to user request") + else: + logger.error(f"Scraping stopped due to timeout at {datetime.now()}") + logger.info("Scraping completed.") + scraping_active = False + +def generate_statistics(df): + df['hour'] = df['timestamp'].dt.hour + activity_by_hour = df.groupby('hour').size() + return activity_by_hour + +@app.route('/') +def index(): + form = ScrapingForm() + return render_template('index.html', form=form) + +@app.route('/start_scraping', methods=['POST']) +def start_scraping(): + global scraping_active, scraping_thread + form = ScrapingForm() + if form.validate_on_submit(): + if scraping_active: + logger.warning("Can't start scraping process: scraping already in progress") + return jsonify({"status": "Scraping already in progress"}) + + scraping_active = True + + faction_id = form.faction_id.data + fetch_interval = form.fetch_interval.data + run_interval = form.run_interval.data + + # Start scraping in a separate thread + scraping_thread = threading.Thread(target=scrape_data, args=(faction_id, fetch_interval, run_interval)) + scraping_thread.daemon = True + scraping_thread.start() + + return jsonify({"status": "Scraping started"}) + return jsonify({"status": "Invalid form data"}) + +@app.route('/stop_scraping', methods=['POST']) +def stop_scraping(): + global scraping_active + if not scraping_active: + return jsonify({"status": "No scraping in progress"}) + + scraping_active = False + logger.debug("scraping_active set to False") + return jsonify({"status": "Scraping stopped"}) + +@app.route('/scraping_status', methods=['GET']) +def scraping_status(): + global scraping_active + logger.debug(f"scraping_status called: scraping_active = {scraping_active}") + return jsonify({"scraping_active": scraping_active}) + +@app.route('/logs') +def logs(): + def generate(): + while True: + if not log_queue.empty(): + log = log_queue.get().getMessage() + yield f"data: {log}\n\n" + time.sleep(0.1) + return Response(generate(), mimetype='text/event-stream') + +@app.route('/logfile', methods=['GET']) +def logfile(): + lines = int(request.args.get('lines', 100)) # Number of lines to read + log_file_path = logFile # Path to the current log file + + if not os.path.isfile(log_file_path): + return jsonify({"error": "Log file not found"}), 404 + + with open(log_file_path, 'r') as file: + log_lines = file.readlines() + + return jsonify({"log": log_lines[-lines:]}) + +@app.route('/results') +def results(): + # Assuming the scraping is done and data is saved somewhere + faction_id = request.args.get('faction_id') + filename = f"data/{faction_id}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}.csv" + if os.path.isfile(filename): + df = pd.read_csv(filename) + stats = generate_statistics(df) + return render_template('results.html', stats=stats.to_dict()) + else: + return "No data found." + +@app.route('/download_results') +def download_results(): + data_files = glob.glob("data/*.csv") + log_files = glob.glob("log/*.log") + + def get_file_info(file_path): + return { + "name": file_path, + "last_modified": os.path.getmtime(file_path), + "created": os.path.getctime(file_path), + "size": get_size(file_path) + } + + data_files_info = [get_file_info(file) for file in data_files] + log_files_info = [get_file_info(file) for file in log_files] + + files = {"data": data_files_info, "log": log_files_info} + return render_template('download_results.html', files=files) + +@app.route('/delete_file', methods=['POST']) +def delete_file(): + file_path = request.form.get('file_path') + + if not file_path or not os.path.isfile(file_path): + return jsonify({"error": "File not found"}), 404 + + try: + os.remove(file_path) + return jsonify({"success": True}), 200 + except Exception as e: + return jsonify({"error": str(e)}), 500 + +@app.template_filter('datetimeformat') +def datetimeformat(value): + return datetime.fromtimestamp(value).strftime('%Y-%m-%d %H:%M:%S') + +def get_size(path): + size = os.path.getsize(path) + if size < 1024: + return f"{size} bytes" + elif size < pow(1024,2): + return f"{round(size/1024, 2)} KB" + elif size < pow(1024,3): + return f"{round(size/(pow(1024,2)), 2)} MB" + elif size < pow(1024,4): + return f"{round(size/(pow(1024,3)), 2)} GB" + + +@app.route('/data/') +def download_data_file(filename): + return send_from_directory('data', filename) + +@app.route('/logs/') +def download_log_file(filename): + return send_from_directory('logs', filename) + +if __name__ == '__main__': + app.run(debug=True, threaded=True) \ No newline at end of file diff --git a/example_config.ini b/example_config.ini new file mode 100644 index 0000000..50655ba --- /dev/null +++ b/example_config.ini @@ -0,0 +1,3 @@ +[DEFAULT] +SECRET_KEY = your_secret_key +API_KEY = your_api_key diff --git a/forms.py b/forms.py new file mode 100644 index 0000000..b311862 --- /dev/null +++ b/forms.py @@ -0,0 +1,9 @@ +from flask_wtf import FlaskForm +from wtforms import StringField, IntegerField, SubmitField +from wtforms.validators import DataRequired + +class ScrapingForm(FlaskForm): + faction_id = StringField('Faction ID', validators=[DataRequired()], default='9686') + fetch_interval = IntegerField('Fetch Interval (seconds)', validators=[DataRequired()], default=60) + run_interval = IntegerField('Run Interval (days)', validators=[DataRequired()], default=1) + submit = SubmitField('Start Scraping') \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..a2843d2 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,87 @@ +appdirs==1.4.4 +application-utility==1.3.3 +attrs==23.2.1.dev0 +autocommand==2.2.2 +beautifulsoup4==4.12.3 +btrfsutil==6.12 +CacheControl==0.14.1 +cachetools==5.5.0 +certifi==2024.8.30 +cffi==1.17.1 +chardet==5.2.0 +charset-normalizer==3.4.0 +contourpy==1.3.1 +coverage==7.6.8 +cryptography==43.0.3 +cssselect==1.2.0 +cupshelpers==1.0 +cycler==0.12.1 +dbus-python==1.3.2 +distro==1.9.0 +docopt==0.6.2 +filelock==3.16.1 +fonttools==4.55.3 +idna==3.10 +inputs==0.5 +jaraco.collections==5.0.1 +jaraco.context==5.3.0 +jaraco.functools==4.0.2 +jaraco.text==4.0.0 +keyutils==0.6 +kiwisolver==1.4.5 +lit==18.1.8.dev0 +lockfile==0.12.2 +lxml==5.3.0 +Markdown==3.7 +matplotlib==3.9.3 +meson==1.6.0 +moddb==0.11.0 +more-itertools==10.3.0 +msgpack==1.0.5 +netsnmp-python==1.0a1 +nftables==0.1 +npyscreen==4.10.5 +numpy==2.2.0 +packaging==24.2 +pacman_mirrors==4.27 +pillow==11.0.0 +platformdirs==4.3.6 +ply==3.11 +ProtonUp-Qt==2.10.0 +pspdfutils==3.3.6 +psutil==6.1.0 +puremagic==1.28 +pyaml==24.9.0 +pycairo==1.27.0 +pycparser==2.22 +pycryptodomex==3.21.0 +pycups==2.0.4 +Pygments==2.18.0 +PyGObject==3.50.0 +pyparsing==3.1.2 +pypdf==5.1.0 +PyQt5==5.15.11 +PyQt5_sip==12.16.1 +pyserial==3.5 +PySide6==6.8.1 +pysmbc==1.0.25.1 +python-dateutil==2.9.0 +pyxdg==0.28 +PyYAML==6.0.2 +reportlab==4.2.2 +requests==2.32.3 +scour==0.38.2 +setuptools==75.2.0 +shiboken6==6.8.1 +shiboken6-generator==6.8.1 +six==1.16.0 +smbus==1.1 +soupsieve==2.6 +steam==1.6.1 +TBB==0.2 +tqdm==4.67.1 +udiskie==2.5.3 +urllib3==1.26.20 +vdf==4.0 +wheel==0.45.0 +zstandard==0.22.0 diff --git a/static/app.js b/static/app.js new file mode 100644 index 0000000..7244679 --- /dev/null +++ b/static/app.js @@ -0,0 +1,106 @@ +document.addEventListener('DOMContentLoaded', () => { + const form = document.getElementById('scrapingForm'); + const stopButton = document.getElementById('stopButton'); + const logsElement = document.getElementById('logs'); + const prevPageButton = document.getElementById('prevPage'); + const nextPageButton = document.getElementById('nextPage'); + let currentPage = 0; + const linesPerPage = 50; + let autoRefreshInterval; + + console.log('Form:', form); + console.log('Submit button:', form.querySelector('button[type="submit"]')); + + const fetchLogs = (page) => { + fetch(`/logfile?lines=${linesPerPage * (page + 1)}`) + .then(response => response.json()) + .then(data => { + if (data.error) { + logsElement.textContent = data.error; + } else { + // Reverse the order of log lines + const reversedLogs = data.log.reverse(); + logsElement.textContent = reversedLogs.join(''); + } + }); + }; + + const startAutoRefresh = () => { + autoRefreshInterval = setInterval(() => { + fetchLogs(currentPage); + }, 5000); // Refresh every 5 seconds + }; + + const stopAutoRefresh = () => { + clearInterval(autoRefreshInterval); + }; + + // Check scraping status on page load + fetch('/scraping_status') + .then(response => response.json()) + .then(data => { + if (data.scraping_active) { + startButton.disabled = true; + stopButton.disabled = false; + startAutoRefresh(); // Start auto-refresh if scraping is active + } else { + startButton.disabled = false; + stopButton.disabled = true; + } + fetchLogs(currentPage); + }); + + prevPageButton.addEventListener('click', () => { + if (currentPage > 0) { + currentPage--; + fetchLogs(currentPage); + } + }); + + nextPageButton.addEventListener('click', () => { + currentPage++; + fetchLogs(currentPage); + }); + + form.addEventListener('submit', function(e) { + e.preventDefault(); + const formData = new FormData(this); + + fetch('/start_scraping', { + method: 'POST', + body: formData + }).then(response => response.json()) + .then(data => { + console.log(data); + const submitButton = form.querySelector('button[type="submit"]'); + if (data.status === "Scraping started") { + if (submitButton) { + submitButton.disabled = true; + } + stopButton.disabled = false; + startAutoRefresh(); // Start auto-refresh when scraping starts + } else { + // Handle errors or other statuses + } + }); + }); + + stopButton.addEventListener('click', function() { + fetch('/stop_scraping', { + method: 'POST' + }).then(response => response.json()) + .then(data => { + console.log(data); + const submitButton = form.querySelector('button[type="submit"]'); + if (data.status === "Scraping stopped") { + if (submitButton) { + submitButton.disabled = false; + } + stopButton.disabled = true; + stopAutoRefresh(); // Stop auto-refresh when scraping stops + } else { + // Handle errors or other statuses + } + }); + }); +}); \ No newline at end of file diff --git a/static/style.css b/static/style.css new file mode 100644 index 0000000..e69de29 diff --git a/templates/base.html b/templates/base.html new file mode 100644 index 0000000..a3245a6 --- /dev/null +++ b/templates/base.html @@ -0,0 +1,23 @@ + + + + + Your page title + + + + {{ bootstrap.load_css() }} + + + +
+

Torn User Activity Scraper

+ +
diff --git a/templates/download_results.html b/templates/download_results.html new file mode 100644 index 0000000..43453f8 --- /dev/null +++ b/templates/download_results.html @@ -0,0 +1,52 @@ + + + + + Your page title + + + + {{ bootstrap.load_css() }} + + + +
+

Torn User Activity Scraper

+ +
+ +
+
+
+

Available Files

+ + + + + + + + + {% for file in files %} + + + + + {% endfor %} + + + + + {% block scripts %} + {{ bootstrap.load_js() }} + + {% endblock %} + + \ No newline at end of file diff --git a/templates/index.html b/templates/index.html new file mode 100644 index 0000000..be09d44 --- /dev/null +++ b/templates/index.html @@ -0,0 +1,64 @@ + + + + + Your page title + + + + {{ bootstrap.load_css() }} + + + +
+

Torn User Activity Scraper

+ +
+
+
+
+

Config

+
+ {{ form.hidden_tag() }} +
+ {{ form.faction_id.label(class="form-control-label") }} + {{ form.faction_id(class="form-control") }} +
+
+ {{ form.fetch_interval.label(class="form-control-label") }} + {{ form.fetch_interval(class="form-control") }} +
+
+ {{ form.run_interval.label(class="form-control-label") }} + {{ form.run_interval(class="form-control") }} +
+
+ {{ form.submit(class="btn btn-primary", type="submit", id="startButton") }} +
+ + +
+
+ +
+
+

Logs

+ + +
+
+
+
+ {% block scripts %} + {{ bootstrap.load_js() }} + + {% endblock %} + + \ No newline at end of file diff --git a/templates/results.html b/templates/results.html new file mode 100644 index 0000000..a9fb16a --- /dev/null +++ b/templates/results.html @@ -0,0 +1,22 @@ + + + + + Scraping Results + + +

User Activity Statistics

+
DataLogs
{{ file }}{{ file }}
+ + + + + {% for hour, count in stats.items() %} + + + + + {% endfor %} +
HourActivity Count
{{ hour }}{{ count }}
+ +