import os

import pandas as pd
import matplotlib

matplotlib.use("Agg")  # Prevents GUI-related issues in Flask

import matplotlib.pyplot as plt
import seaborn as sns

# Number of hour buckets shown on the activity plot (0..23).
_HOURS_IN_DAY = 24

# Directory (relative to the working directory) that plots are written under.
_PLOT_DIR = os.path.join("app", "static", "plots")


def load_data(file_path: str) -> pd.DataFrame:
    """Load the scraped data from a CSV file into a Pandas DataFrame.

    The ``timestamp`` and ``last_action`` columns are parsed as datetimes;
    values that cannot be parsed become ``NaT`` (``errors="coerce"``).

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The loaded frame with both date columns converted.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        KeyError: If the CSV lacks a ``timestamp`` or ``last_action`` column.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File {file_path} not found.")

    df = pd.read_csv(file_path)
    for column in ("timestamp", "last_action"):
        df[column] = pd.to_datetime(df[column], errors="coerce")
    return df


def _hour_of_day(df: pd.DataFrame) -> pd.Series:
    """Return the hour of each row's ``timestamp`` as a Series named ``hour``.

    The caller's frame is left untouched. A non-datetime ``timestamp`` column
    is converted on a temporary copy, with unparseable values becoming NaT
    (and therefore a missing hour).
    """
    timestamps = df["timestamp"]
    if not pd.api.types.is_datetime64_any_dtype(timestamps):
        timestamps = pd.to_datetime(timestamps, errors="coerce")
    return timestamps.dt.hour.rename("hour")


def generate_statistics(df: pd.DataFrame) -> pd.Series:
    """Generate activity statistics grouped by hour.

    Args:
        df: Frame with a ``timestamp`` column (datetime, or values that
            ``pd.to_datetime`` can parse).

    Returns:
        A Series indexed by ``hour`` whose values are the number of rows that
        fall in that hour. Hours with no rows are absent, and rows without a
        valid timestamp are not counted.
    """
    # Group on a derived Series instead of writing an "hour" column into the
    # caller's frame, so computing statistics has no hidden side effect.
    return df.groupby(_hour_of_day(df)).size()


def plot_activity_distribution(
    df: pd.DataFrame, output_path: str = "activity_distribution.png"
) -> str:
    """Plot user activity per hour of the day and save the figure.

    Args:
        df: Frame with a ``timestamp`` column (datetime, or values that
            ``pd.to_datetime`` can parse). It is not modified.
        output_path: File name, optionally with sub-directories, resolved
            under ``app/static/plots``.

    Returns:
        The path the plot was written to.

    Raises:
        FileNotFoundError: If the file is missing after saving.
    """
    output_path = os.path.join(_PLOT_DIR, output_path)
    # Create the directory of the final path, not only the plots root, so an
    # output_path that contains sub-directories can be saved as well.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Rows without a valid timestamp carry no hour; drop them and keep the
    # hours as integers so the axis labels read "9" rather than "9.0".
    hours = _hour_of_day(df).dropna().astype(int)

    # The bars are categorical: one slot per row of activity_counts. Listing
    # all 24 hours (zero where nothing happened) keeps slot i equal to hour i,
    # so the 0..23 ticks below line up with the bars.
    counts = hours.groupby(hours).size().reindex(range(_HOURS_IN_DAY), fill_value=0)
    activity_counts = counts.rename_axis("hour").reset_index(name="count")

    fig, ax = plt.subplots(figsize=(10, 5))
    try:
        # Fix Seaborn Warning: Assign `hue` explicitly
        sns.barplot(
            x="hour",
            y="count",
            data=activity_counts,
            hue="hour",
            palette="Blues",
            legend=False,
            ax=ax,
        )
        ax.set_xlabel("Hour of the Day")
        ax.set_ylabel("Activity Count")
        ax.set_title("User Activity Distribution")
        ax.set_xticks(range(_HOURS_IN_DAY))
        fig.savefig(output_path, bbox_inches="tight")
    finally:
        # Always release the figure, even when drawing or saving fails;
        # otherwise figures accumulate in a long-running server process.
        plt.close(fig)

    # Verify the file exists after saving
    if not os.path.exists(output_path):
        raise FileNotFoundError(f"Plot could not be saved to {output_path}")

    return output_path