"""Clean the Arepo benchmark log for Grafana.

Reads the raw benchmark CSV, extracts a few fields buried inside
JSON-ish string columns (``env_vars`` and ``spack_spec_dict``), and
writes a flat CSV with one column per extracted field.
"""

import ast
import json
import os

import pandas as pd

# Define input and output file paths
INPUT_PATH = "/cosma7/data/www/publicdata/grafanadata/ArepoBenchmark.log"
OUTPUT_PATH = "/cosma7/data/www/publicdata/grafanadata/ArepoBenchmark_cleaned.csv"


def safe_json_parse(s):
    """Best-effort parse of a JSON-ish string into a Python object.

    The log stores dicts as Python reprs (single quotes, ``None``) and
    sometimes uses ``;`` instead of ``,``. Strategy:

    1. Try ``json.loads`` after the historical quote/semicolon fixup.
       (The fixup is a heuristic — it corrupts values containing
       apostrophes, which is why step 2 exists.)
    2. Fall back to ``ast.literal_eval`` on the untouched string, which
       handles genuine Python literals (``'...'``, ``None``) safely.
    3. Return ``{}`` if neither succeeds (e.g. NaN stringified to "nan").

    Parameters
    ----------
    s : str
        Raw cell contents.

    Returns
    -------
    object
        The parsed value (usually a dict), or ``{}`` on failure.
    """
    try:
        return json.loads(s.replace("'", '"').replace(";", ","))
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError subclass, so this also
        # covers malformed JSON produced by the naive replacements.
        pass
    try:
        return ast.literal_eval(s)
    except (ValueError, SyntaxError):
        return {}


def main(input_path=INPUT_PATH, output_path=OUTPUT_PATH):
    """Read the raw log, extract fields, and write the cleaned CSV.

    Parameters
    ----------
    input_path : str
        Path of the raw benchmark log (CSV).
    output_path : str
        Destination for the cleaned CSV.
    """
    # Read the CSV, skipping problematic lines if needed
    df = pd.read_csv(input_path, on_bad_lines="skip")

    # Containers for extracted fields
    omp_threads = []
    compiler = []
    fftw = []

    # Extract relevant fields from the complex string columns.
    # str() guards against NaN cells — safe_json_parse then yields {}.
    for _, row in df.iterrows():
        env_vars = safe_json_parse(str(row.get("env_vars", "{}")))
        spack_dict = safe_json_parse(str(row.get("spack_spec_dict", "{}")))

        omp_threads.append(env_vars.get("OMP_NUM_THREADS", None))

        arepo_data = spack_dict.get("arepo", {})
        compiler.append(arepo_data.get("compiler", {}).get("name", None))
        fftw.append(arepo_data.get("variants", {}).get("fftw", None))

    # Construct cleaned DataFrame
    clean_df = pd.DataFrame({
        "job_completion_time": df["job_completion_time"],
        "duration_value": df["duration_value"],
        "OMP_NUM_THREADS": omp_threads,
        "compiler": compiler,
        "fftw": fftw
    })

    # Save the cleaned data
    clean_df.to_csv(output_path, index=False)
    print(f"✅ Cleaned CSV saved at: {output_path}")


if __name__ == "__main__":
    main()