This commit is contained in:
Michael Beck
2025-01-31 00:31:25 +01:00
commit 026d6742db
12 changed files with 884 additions and 0 deletions

198
.gitignore vendored Normal file
View File

@@ -0,0 +1,198 @@
# Created by https://www.toptal.com/developers/gitignore/api/flask
# Edit at https://www.toptal.com/developers/gitignore?templates=flask
### Flask ###
instance/*
!instance/.gitignore
.webassets-cache
.env
### Flask.Python Stack ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# End of https://www.toptal.com/developers/gitignore/api/flask
# Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode
# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode
### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets
# Local History for Visual Studio Code
.history/
# Built Visual Studio Code Extensions
*.vsix
### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide
# End of https://www.toptal.com/developers/gitignore/api/visualstudiocode
# Exclude data files
*.csv
config.ini

61
README.md Normal file
View File

@@ -0,0 +1,61 @@
# Torn User Activity Scraper
This project is a web application that scrapes user activity data from the Torn API and displays the results. It includes features for starting and stopping the scraping process, viewing logs, and downloading results.
## Features
- Start and stop scraping user activity data
- View real-time logs
- Download data and log files
- View scraping results and statistics
## Requirements
- Python 3.8+
- Flask
- Flask-Bootstrap
- Flask-WTF
- Pandas
- Requests
## Installation
1. Clone the repository:
```sh
git clone https://github.com/yourusername/torn-user-activity-scraper.git
cd torn-user-activity-scraper
```
2. Create a virtual environment and activate it:
```sh
python3 -m venv venv
source venv/bin/activate
```
3. Install the required packages:
```sh
pip install -r requirements.txt
```
4. Set up your configuration file:
Create a `config.ini` file in the root directory of the project by copying `example_config.ini` and filling in the following content:
```ini
[DEFAULT]
SECRET_KEY = your_secret_key
API_KEY = your_api_key
```
## Usage
1. Run the Flask application:
```sh
flask run
```
2. Open your web browser and navigate to `http://127.0.0.1:5000/`.
## License
This project is licensed under the MIT License.

259
app.py Normal file
View File

@@ -0,0 +1,259 @@
from flask import Flask, request, render_template, Response, jsonify, url_for
from flask_bootstrap import Bootstrap5
from forms import ScrapingForm
import requests
import pandas as pd
import time
from datetime import datetime, timedelta
import threading
import logging
from logging.handlers import QueueHandler
from queue import Queue
import os
import glob
from datetime import datetime
from flask import send_from_directory
import configparser
app = Flask(__name__)

# Load configuration (see example_config.ini for the expected keys).
config = configparser.ConfigParser()
config.read('config.ini')
app.config['SECRET_KEY'] = config['DEFAULT']['SECRET_KEY']
# Torn API key used by every outgoing request.
API_KEY = config['DEFAULT']['API_KEY']

bootstrap = Bootstrap5(app)

# Initialize the logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)  # Adjust as needed

# Make any logger.info() call go to both the log file and the queue.
# 1) FILE HANDLER
# NOTE(review): assumes a log/ directory already exists — logging.FileHandler
# raises FileNotFoundError otherwise; confirm deployment creates it.
logFile = "log/" + datetime.now().strftime('%Y-%m-%d-%H-%M') + '.log'
file_handler = logging.FileHandler(logFile, mode='w')
file_handler.setLevel(logging.DEBUG)  # or INFO, WARNING, etc.
formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s',
                              datefmt='%m/%d/%Y %I:%M:%S %p')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# 2) QUEUE HANDLER
# Records pushed onto this queue are drained by the /logs SSE endpoint
# and streamed to the browser.
log_queue = Queue()
queue_handler = QueueHandler(log_queue)
queue_handler.setLevel(logging.DEBUG)
logger.addHandler(queue_handler)

# Global state shared between the request handlers and the worker thread.
scraping_active = False
scraping_thread = None
def fetch_faction_data(faction_id):
    """Fetch faction data (including the member roster) from the Torn API.

    Args:
        faction_id: Torn faction ID to query.

    Returns:
        Parsed JSON dict on success, or None when the request fails.
    """
    url = f"https://api.torn.com/faction/{faction_id}?selections=&key={API_KEY}"
    try:
        # Fix: without a timeout a stalled connection hangs the worker thread
        # forever; a network error previously crashed it with an unhandled
        # exception.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        logger.warning(f"Request error fetching faction {faction_id}: {exc}")
        return None
    if response.status_code == 200:
        logger.info(f"Fetched data for faction ID {faction_id}")
        return response.json()
    logger.warning(f"Failed to fetch faction data for faction ID {faction_id}")
    return None
def fetch_user_activity(user_id):
    """Fetch basic profile/activity data for one user from the Torn API.

    Args:
        user_id: Torn user ID to query.

    Returns:
        Parsed JSON dict on success, or None when the request fails.
    """
    url = f"https://api.torn.com/user/{user_id}?selections=basic,profile&key={API_KEY}"
    try:
        # Fix: add a timeout and handle transport errors, matching
        # fetch_faction_data, so one bad request cannot kill the worker.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        logger.error(f"Request error fetching user {user_id}: {exc}")
        return None
    if response.status_code == 200:
        return response.json()
    logger.error(f"Failed to fetch user activity for user ID {user_id}")
    return None
def scrape_data(faction_id, fetch_interval, run_interval):
    """Worker loop: poll the Torn API and append member activity to a CSV.

    Runs until `run_interval` days elapse or `scraping_active` is cleared
    by the /stop_scraping endpoint.

    Args:
        faction_id: faction whose members are polled.
        fetch_interval: seconds to wait between polling rounds.
        run_interval: total runtime in days.
    """
    global scraping_active
    end_time = datetime.now() + timedelta(days=run_interval)
    # One file per run, stamped with the start time.
    filename = f"data/{faction_id}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}.csv"
    while datetime.now() < end_time and scraping_active:
        logger.info(f"Fetching data at {datetime.now()}")
        faction_data = fetch_faction_data(faction_id)
        if faction_data and 'members' in faction_data:
            user_activity_data = []
            for user_id, user_info in faction_data['members'].items():
                user_activity = fetch_user_activity(user_id)
                if user_activity:
                    user_activity_data.append({
                        'user_id': user_id,
                        'name': user_activity.get('name', ''),
                        'last_action': user_activity.get('last_action', {}).get('timestamp', 0),
                        'status': user_activity.get('status', {}).get('state', ''),
                        'timestamp': datetime.now().timestamp()
                    })
                    logger.info(f"Fetched data for user {user_id} ({user_activity.get('name', '')})")
            # Fix: an empty DataFrame has no 'last_action'/'timestamp' columns,
            # so the conversions below raised KeyError when no user responded.
            if user_activity_data:
                df = pd.DataFrame(user_activity_data)
                df['last_action'] = pd.to_datetime(df['last_action'], unit='s')
                df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
                # Write the header only when creating the file.
                df.to_csv(filename, mode='a', header=not os.path.isfile(filename), index=False)
                # Fix: the original message was garbled ("Data appended to (unknown)").
                logger.info(f"Data appended to {filename}")
            time.sleep(fetch_interval)
        else:
            # Fix: a failed fetch previously looped straight back with no
            # delay, hammering the API; wait one interval before retrying.
            logger.warning(f"Failed to fetch faction data at {datetime.now()}; retrying")
            time.sleep(fetch_interval)
    # Report why the loop ended (the original's condition chain was inverted
    # and unreachable in part).
    if not scraping_active:
        logger.warning(f"Scraping stopped at {datetime.now()} due to user request")
    else:
        logger.info(f"Scraping stopped due to timeout at {datetime.now()}")
    logger.info("Scraping completed.")
    scraping_active = False
def generate_statistics(df):
    """Count activity records per hour of the day.

    Note: adds an 'hour' column to *df* in place.

    Args:
        df: DataFrame with a datetime64 'timestamp' column.

    Returns:
        Series indexed by hour (0-23) with the record count per hour.
    """
    df['hour'] = df['timestamp'].dt.hour
    return df.groupby('hour').size()
@app.route('/')
def index():
    """Render the landing page with the scraping configuration form."""
    return render_template('index.html', form=ScrapingForm())
@app.route('/start_scraping', methods=['POST'])
def start_scraping():
    """Validate the submitted form and launch the scraping worker thread.

    Returns a JSON status: started, already running, or invalid form.
    """
    global scraping_active, scraping_thread
    form = ScrapingForm()
    if not form.validate_on_submit():
        return jsonify({"status": "Invalid form data"})
    if scraping_active:
        logger.warning("Can't start scraping process: scraping already in progress")
        return jsonify({"status": "Scraping already in progress"})
    scraping_active = True
    worker_args = (form.faction_id.data,
                   form.fetch_interval.data,
                   form.run_interval.data)
    # Daemon thread: it must not block interpreter shutdown.
    scraping_thread = threading.Thread(target=scrape_data, args=worker_args, daemon=True)
    scraping_thread.start()
    return jsonify({"status": "Scraping started"})
@app.route('/stop_scraping', methods=['POST'])
def stop_scraping():
    """Signal the worker loop to stop after its current iteration."""
    global scraping_active
    if scraping_active:
        scraping_active = False
        logger.debug("scraping_active set to False")
        return jsonify({"status": "Scraping stopped"})
    return jsonify({"status": "No scraping in progress"})
@app.route('/scraping_status', methods=['GET'])
def scraping_status():
    """Report whether the worker thread is currently scraping (JSON)."""
    logger.debug(f"scraping_status called: scraping_active = {scraping_active}")
    return jsonify({"scraping_active": scraping_active})
@app.route('/logs')
def logs():
    """Stream log records to the browser as server-sent events."""
    def generate():
        while True:
            # Fix: blocking get() replaces the original 0.1s empty()/sleep
            # polling loop, which burned CPU and could race other consumers.
            record = log_queue.get()
            yield f"data: {record.getMessage()}\n\n"
    return Response(generate(), mimetype='text/event-stream')
@app.route('/logfile', methods=['GET'])
def logfile():
    """Return the last N lines of the current log file as JSON.

    Query args:
        lines: how many trailing lines to return (default 100).
    """
    try:
        # Fix: a non-numeric 'lines' value previously raised ValueError
        # and surfaced as an unhandled 500.
        lines = int(request.args.get('lines', 100))
    except ValueError:
        return jsonify({"error": "'lines' must be an integer"}), 400
    if not os.path.isfile(logFile):
        return jsonify({"error": "Log file not found"}), 404
    with open(logFile, 'r') as file:
        log_lines = file.readlines()
    return jsonify({"log": log_lines[-lines:]})
@app.route('/results')
def results():
    """Show per-hour activity statistics for the most recent data file.

    Query args:
        faction_id: faction whose CSV files to look up.
    """
    faction_id = request.args.get('faction_id')
    # Fix: the original rebuilt the filename from the *current* minute, which
    # only matched a file created in that same minute; pick the newest file
    # for this faction instead.
    candidates = glob.glob(f"data/{faction_id}-*.csv")
    if not candidates:
        return "No data found."
    latest = max(candidates, key=os.path.getmtime)
    # Fix: parse_dates is required — generate_statistics uses the .dt
    # accessor, which fails on the string column read_csv produces by default.
    df = pd.read_csv(latest, parse_dates=['timestamp'])
    stats = generate_statistics(df)
    return render_template('results.html', stats=stats.to_dict())
@app.route('/download_results')
def download_results():
    """List data and log files with metadata for the download page."""
    def describe(path):
        # One metadata row per file, consumed by the template table.
        return {
            "name": path,
            "last_modified": os.path.getmtime(path),
            "created": os.path.getctime(path),
            "size": get_size(path),
        }
    files = {
        "data": [describe(p) for p in glob.glob("data/*.csv")],
        "log": [describe(p) for p in glob.glob("log/*.log")],
    }
    return render_template('download_results.html', files=files)
@app.route('/delete_file', methods=['POST'])
def delete_file():
    """Delete a managed data or log file.

    Form args:
        file_path: path of the file to remove; must live directly inside
            the data/ or log/ directory.
    """
    file_path = request.form.get('file_path')
    if not file_path or not os.path.isfile(file_path):
        return jsonify({"error": "File not found"}), 404
    # Security fix: the path comes straight from the client, so the original
    # allowed deleting ANY file the process could reach (e.g. ../config.ini).
    # Restrict deletion to the two directories this app manages.
    allowed_dirs = (os.path.abspath('data'), os.path.abspath('log'))
    if os.path.dirname(os.path.abspath(file_path)) not in allowed_dirs:
        return jsonify({"error": "File not found"}), 404
    try:
        os.remove(file_path)
        return jsonify({"success": True}), 200
    except OSError as e:
        # Narrowed from Exception: os.remove raises OSError subclasses.
        return jsonify({"error": str(e)}), 500
@app.template_filter('datetimeformat')
def datetimeformat(value):
    """Jinja filter: render a POSIX timestamp as 'YYYY-MM-DD HH:MM:SS' (local time)."""
    moment = datetime.fromtimestamp(value)
    return moment.strftime('%Y-%m-%d %H:%M:%S')
def get_size(path):
    """Return the size of the file at *path* as a human-readable string.

    Args:
        path: filesystem path passed to os.path.getsize.

    Returns:
        Size formatted as bytes, KB, MB, GB or TB (binary units, 2 decimals).
    """
    size = os.path.getsize(path)
    if size < 1024:
        return f"{size} bytes"
    elif size < pow(1024, 2):
        return f"{round(size/1024, 2)} KB"
    elif size < pow(1024, 3):
        return f"{round(size/pow(1024, 2), 2)} MB"
    elif size < pow(1024, 4):
        return f"{round(size/pow(1024, 3), 2)} GB"
    else:
        # Fix: the original fell off the end and returned None for >= 1 TiB.
        return f"{round(size/pow(1024, 4), 2)} TB"
@app.route('/data/<path:filename>')
def download_data_file(filename):
    """Serve a scraped CSV from the data/ directory.

    send_from_directory guards against path traversal in *filename*.
    """
    return send_from_directory('data', filename)
@app.route('/logs/<path:filename>')
def download_log_file(filename):
    """Serve a log file from the log/ directory.

    send_from_directory guards against path traversal in *filename*.
    """
    # Fix: log files are written to 'log/' (see logFile above and the
    # 'log/*.log' glob in download_results), but this served from 'logs'.
    return send_from_directory('log', filename)
if __name__ == '__main__':
    # Development entry point only: debug=True enables the reloader/debugger
    # and must not be used in production. threaded=True lets the long-lived
    # /logs SSE stream run alongside other requests.
    app.run(debug=True, threaded=True)

3
example_config.ini Normal file
View File

@@ -0,0 +1,3 @@
[DEFAULT]
SECRET_KEY = your_secret_key
API_KEY = your_api_key

9
forms.py Normal file
View File

@@ -0,0 +1,9 @@
from flask_wtf import FlaskForm
from wtforms import StringField, IntegerField, SubmitField
from wtforms.validators import DataRequired
class ScrapingForm(FlaskForm):
    """Form configuring one scraping run (faction, polling rate, duration)."""
    # Torn faction whose members will be polled.
    faction_id = StringField('Faction ID', validators=[DataRequired()], default='9686')
    # Seconds between polling rounds.
    # NOTE(review): DataRequired rejects 0 for IntegerFields — presumably
    # intended here (a 0s interval would busy-loop), but confirm.
    fetch_interval = IntegerField('Fetch Interval (seconds)', validators=[DataRequired()], default=60)
    # Total runtime of the scraping run, in days.
    run_interval = IntegerField('Run Interval (days)', validators=[DataRequired()], default=1)
    submit = SubmitField('Start Scraping')

87
requirements.txt Normal file
View File

@@ -0,0 +1,87 @@
appdirs==1.4.4
application-utility==1.3.3
attrs==23.2.1.dev0
autocommand==2.2.2
beautifulsoup4==4.12.3
btrfsutil==6.12
CacheControl==0.14.1
cachetools==5.5.0
certifi==2024.8.30
cffi==1.17.1
chardet==5.2.0
charset-normalizer==3.4.0
contourpy==1.3.1
coverage==7.6.8
cryptography==43.0.3
cssselect==1.2.0
cupshelpers==1.0
cycler==0.12.1
dbus-python==1.3.2
distro==1.9.0
docopt==0.6.2
filelock==3.16.1
fonttools==4.55.3
idna==3.10
inputs==0.5
jaraco.collections==5.0.1
jaraco.context==5.3.0
jaraco.functools==4.0.2
jaraco.text==4.0.0
keyutils==0.6
kiwisolver==1.4.5
lit==18.1.8.dev0
lockfile==0.12.2
lxml==5.3.0
Markdown==3.7
matplotlib==3.9.3
meson==1.6.0
moddb==0.11.0
more-itertools==10.3.0
msgpack==1.0.5
netsnmp-python==1.0a1
nftables==0.1
npyscreen==4.10.5
numpy==2.2.0
packaging==24.2
pacman_mirrors==4.27
pillow==11.0.0
platformdirs==4.3.6
ply==3.11
ProtonUp-Qt==2.10.0
pspdfutils==3.3.6
psutil==6.1.0
puremagic==1.28
pyaml==24.9.0
pycairo==1.27.0
pycparser==2.22
pycryptodomex==3.21.0
pycups==2.0.4
Pygments==2.18.0
PyGObject==3.50.0
pyparsing==3.1.2
pypdf==5.1.0
PyQt5==5.15.11
PyQt5_sip==12.16.1
pyserial==3.5
PySide6==6.8.1
pysmbc==1.0.25.1
python-dateutil==2.9.0
pyxdg==0.28
PyYAML==6.0.2
reportlab==4.2.2
requests==2.32.3
scour==0.38.2
setuptools==75.2.0
shiboken6==6.8.1
shiboken6-generator==6.8.1
six==1.16.0
smbus==1.1
soupsieve==2.6
steam==1.6.1
TBB==0.2
tqdm==4.67.1
udiskie==2.5.3
urllib3==1.26.20
vdf==4.0
wheel==0.45.0
zstandard==0.22.0

106
static/app.js Normal file
View File

@@ -0,0 +1,106 @@
// Front-end controller: wires the scraping form, start/stop buttons and the
// paginated, auto-refreshing log view to the Flask endpoints.
document.addEventListener('DOMContentLoaded', () => {
    const form = document.getElementById('scrapingForm');
    // Fix: startButton was referenced below but never declared; it only
    // worked via the implicit window.<id> global, which breaks in strict
    // mode or if the id changes.
    const startButton = document.getElementById('startButton');
    const stopButton = document.getElementById('stopButton');
    const logsElement = document.getElementById('logs');
    const prevPageButton = document.getElementById('prevPage');
    const nextPageButton = document.getElementById('nextPage');
    let currentPage = 0;
    const linesPerPage = 50;
    let autoRefreshInterval;

    console.log('Form:', form);
    console.log('Submit button:', form.querySelector('button[type="submit"]'));

    // Fetch the last (page+1)*linesPerPage log lines and render newest-first.
    const fetchLogs = (page) => {
        fetch(`/logfile?lines=${linesPerPage * (page + 1)}`)
            .then(response => response.json())
            .then(data => {
                if (data.error) {
                    logsElement.textContent = data.error;
                } else {
                    // slice() first: reverse() alone would mutate data.log.
                    const reversedLogs = data.log.slice().reverse();
                    logsElement.textContent = reversedLogs.join('');
                }
            });
    };

    const startAutoRefresh = () => {
        autoRefreshInterval = setInterval(() => {
            fetchLogs(currentPage);
        }, 5000); // Refresh every 5 seconds
    };

    const stopAutoRefresh = () => {
        clearInterval(autoRefreshInterval);
    };

    // Sync button state with the server on page load.
    fetch('/scraping_status')
        .then(response => response.json())
        .then(data => {
            if (data.scraping_active) {
                startButton.disabled = true;
                stopButton.disabled = false;
                startAutoRefresh(); // Start auto-refresh if scraping is active
            } else {
                startButton.disabled = false;
                stopButton.disabled = true;
            }
            fetchLogs(currentPage);
        });

    prevPageButton.addEventListener('click', () => {
        if (currentPage > 0) {
            currentPage--;
            fetchLogs(currentPage);
        }
    });

    nextPageButton.addEventListener('click', () => {
        currentPage++;
        fetchLogs(currentPage);
    });

    form.addEventListener('submit', function (e) {
        e.preventDefault();
        const formData = new FormData(this);
        fetch('/start_scraping', {
            method: 'POST',
            body: formData
        }).then(response => response.json())
            .then(data => {
                console.log(data);
                const submitButton = form.querySelector('button[type="submit"]');
                if (data.status === "Scraping started") {
                    if (submitButton) {
                        submitButton.disabled = true;
                    }
                    stopButton.disabled = false;
                    startAutoRefresh(); // Start auto-refresh when scraping starts
                } else {
                    // Handle errors or other statuses
                }
            });
    });

    stopButton.addEventListener('click', function () {
        fetch('/stop_scraping', {
            method: 'POST'
        }).then(response => response.json())
            .then(data => {
                console.log(data);
                const submitButton = form.querySelector('button[type="submit"]');
                if (data.status === "Scraping stopped") {
                    if (submitButton) {
                        submitButton.disabled = false;
                    }
                    stopButton.disabled = true;
                    stopAutoRefresh(); // Stop auto-refresh when scraping stops
                } else {
                    // Handle errors or other statuses
                }
            });
    });
});

0
static/style.css Normal file
View File

23
templates/base.html Normal file
View File

@@ -0,0 +1,23 @@
<!-- Shared page chrome: head, navbar header. Pages in this project are
     currently standalone copies of this markup rather than Jinja children. -->
<!DOCTYPE html>
<html lang="en">
<head>
    <!-- Fix: charset was declared twice; one declaration suffices. -->
    <meta charset="utf-8">
    <!-- Fix: replaced the "Your page title" placeholder. -->
    <title>Torn User Activity Scraper</title>
    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
    {{ bootstrap.load_css() }}
    <!-- Fix: the file added in static/ is style.css, not styles.css. -->
    <link rel="stylesheet" href="{{ url_for('static', filename='style.css') }}">
</head>
<body>
<header>
    <h1>Torn User Activity Scraper</h1>
    <nav class="navbar navbar-expand-lg navbar-light bg-light">
        <div class="navbar-nav mr-auto">
            {# NOTE(review): app.py initializes Bootstrap5 — confirm whether
               these macros should come from 'bootstrap5/nav.html' instead. #}
            {% from 'bootstrap4/nav.html' import render_nav_item %}
            {{ render_nav_item('index', 'Home') }}
            {{ render_nav_item('results', 'Results') }}
            {{ render_nav_item('download_results', 'Download Results') }}
        </div>
    </nav>
</header>
View File

@@ -0,0 +1,52 @@
<!-- Download page: lists scraped CSVs and log files produced by app.py's
     download_results view (files = {"data": [...], "log": [...]}, each entry
     a dict with name/last_modified/created/size). -->
<!DOCTYPE html>
<html lang="en">
<head>
    <!-- Fix: charset was declared twice; one declaration suffices. -->
    <meta charset="utf-8">
    <title>Torn User Activity Scraper - Downloads</title>
    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
    {{ bootstrap.load_css() }}
    <!-- Fix: the file added in static/ is style.css, not styles.css. -->
    <link rel="stylesheet" href="{{ url_for('static', filename='style.css') }}">
</head>
<body>
<header>
    <h1>Torn User Activity Scraper</h1>
    <nav class="navbar navbar-expand-lg navbar-light bg-light">
        <div class="navbar-nav mr-auto">
            {% from 'bootstrap4/nav.html' import render_nav_item %}
            {{ render_nav_item('index', 'Home') }}
            {{ render_nav_item('results', 'Results') }}
            {{ render_nav_item('download_results', 'Download Results') }}
        </div>
    </nav>
</header>
<main>
    <section id="scrapingFormContainer" class="container-fluid d-flex justify-content-center">
        <div class="container-md my-5 mx-2 shadow-lg p-4 ">
            <h2>Available Files</h2>
            <table class="table">
                <thead>
                <tr>
                    <th>Data</th>
                    <th>Logs</th>
                </tr>
                </thead>
                <tbody>
                <tr>
                    <td>
                        <ul class="list-unstyled">
                            {# Fix: the original iterated the dict itself (yielding the
                               keys "data"/"log") and linked to the wrong endpoints. #}
                            {% for file in files['data'] %}
                                <li><a href="{{ url_for('download_data_file', filename=file['name'].split('/')[-1]) }}">{{ file['name'] }}</a></li>
                            {% endfor %}
                        </ul>
                    </td>
                    <td>
                        <ul class="list-unstyled">
                            {% for file in files['log'] %}
                                <li><a href="{{ url_for('download_log_file', filename=file['name'].split('/')[-1]) }}">{{ file['name'] }}</a></li>
                            {% endfor %}
                        </ul>
                    </td>
                </tr>
                </tbody>
            </table> {# Fix: the closing </table> tag was missing. #}
        </div>
    </section>
</main>
{% block scripts %}
    {{ bootstrap.load_js() }}
    <script src="{{ url_for('static', filename='app.js') }}"></script>
{% endblock %}
</body>
</html>

64
templates/index.html Normal file
View File

@@ -0,0 +1,64 @@
<!-- Landing page: scraping configuration form plus the paginated,
     auto-refreshing log viewer driven by static/app.js. -->
<!DOCTYPE html>
<html lang="en">
<head>
    <!-- Fix: charset was declared twice; one declaration suffices. -->
    <meta charset="utf-8">
    <!-- Fix: replaced the "Your page title" placeholder. -->
    <title>Torn User Activity Scraper</title>
    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
    {{ bootstrap.load_css() }}
    <!-- Fix: the file added in static/ is style.css, not styles.css. -->
    <link rel="stylesheet" href="{{ url_for('static', filename='style.css') }}">
</head>
<body>
<header>
    <h1>Torn User Activity Scraper</h1>
    <nav class="navbar navbar-expand-lg navbar-light bg-light">
        <div class="navbar-nav mr-auto">
            {% from 'bootstrap4/nav.html' import render_nav_item %}
            {{ render_nav_item('index', 'Home') }}
            {{ render_nav_item('results', 'Results') }}
            {{ render_nav_item('download_results', 'Download Results') }}
        </div>
    </nav>
</header>
<main>
    <section id="scrapingFormContainer" class="container-fluid d-flex justify-content-center">
        <div class="container-md my-5 mx-2 shadow-lg p-4 ">
            <h2>Config</h2>
            <form id="scrapingForm" method="POST" action="{{ url_for('start_scraping') }}">
                {{ form.hidden_tag() }}
                <div class="form-group">
                    {{ form.faction_id.label(class="form-control-label") }}
                    {{ form.faction_id(class="form-control") }}
                </div>
                <div class="form-group">
                    {{ form.fetch_interval.label(class="form-control-label") }}
                    {{ form.fetch_interval(class="form-control") }}
                </div>
                <div class="form-group">
                    {{ form.run_interval.label(class="form-control-label") }}
                    {{ form.run_interval(class="form-control") }}
                </div>
                <div class="form-group">
                    {{ form.submit(class="btn btn-primary", type="submit", id="startButton") }}
                </div>
            </form>
            <button id="stopButton" class="btn btn-primary">Stop Scraping</button>
        </div>
    </section>
    <section id="resultsContainer" class="container-fluid d-flex justify-content-center">
        <div class="container-md my-5 mx-2 shadow-lg p-4">
            <h2>Logs</h2>
            <button id="prevPage">Previous</button>
            <button id="nextPage">Next</button>
            <pre class="pre-scrollable"><code id="logs"></code></pre>
        </div>
    </section>
</main>
{% block scripts %}
    {{ bootstrap.load_js() }}
    <script src="{{ url_for('static', filename='app.js') }}"></script>
{% endblock %}
</body>
</html>

22
templates/results.html Normal file
View File

@@ -0,0 +1,22 @@
<!-- Results page: renders the per-hour activity counts produced by
     generate_statistics() in app.py (stats is a {hour: count} dict). -->
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Scraping Results</title>
</head>
<body>
<h1>User Activity Statistics</h1>
<table border="1">
    <tr>
        <th>Hour</th>
        <th>Activity Count</th>
    </tr>
    {% for hour, count in stats.items() %}
        <tr>
            <td>{{ hour }}</td>
            <td>{{ count }}</td>
        </tr>
    {% endfor %}
</table>
</body>
</html>