Files
hyper/_data/candles/!clean_csv.py
2025-10-13 11:28:09 +02:00

66 lines
2.4 KiB
Python

import pandas as pd
import os
def process_csv_in_directory(directory_path='.'):
    """Remove duplicate rows from every CSV file in a directory.

    Each ``<name>.csv`` located directly inside *directory_path* is read with
    pandas, rows that are exact duplicates of an earlier row (across all
    columns) are dropped, and the result is written next to the original as
    ``<name>_cleaned.csv``. The original file is never modified. Progress and
    per-file summaries are reported with ``print``.

    Files whose name already ends in ``_cleaned.csv`` are treated as output of
    a previous run and are skipped, so running the function repeatedly does
    not produce ``*_cleaned_cleaned.csv`` files.

    A failure while processing one file (unreadable, empty, malformed, ...) is
    reported and does not stop the remaining files from being processed.

    Args:
        directory_path (str): The path to the directory containing the CSV
            files. Defaults to the current directory '.'.

    Returns:
        None. A missing or non-directory path is reported with a message
        instead of raising.
    """
    cleaned_suffix = '_cleaned.csv'

    # 1. Get a list of all entries in the specified directory
    try:
        all_files = os.listdir(directory_path)
    except FileNotFoundError:
        print(f"Error: The directory '{directory_path}' was not found.")
        return
    except NotADirectoryError:
        print(f"Error: '{directory_path}' is not a directory.")
        return

    # 2. Keep only input CSV files. Outputs of an earlier run are excluded so
    #    they are not cleaned again; sorting makes the processing order
    #    deterministic (os.listdir returns entries in arbitrary order).
    csv_files = sorted(
        f for f in all_files
        if f.endswith('.csv') and not f.endswith(cleaned_suffix)
    )
    if not csv_files:
        print("No CSV files found in the directory.")
        return
    print(f"Found {len(csv_files)} CSV files to process...\n")

    # 3. Loop through each CSV file and process it
    for filename in csv_files:
        file_path = os.path.join(directory_path, filename)
        # Deliberately broad: this is a best-effort batch job, so any failure
        # on one file is reported and the loop moves on to the next file.
        try:
            # --- Step 1: Open the CSV file ---
            print(f"--- Processing file: {filename} ---")
            df = pd.read_csv(file_path)
            initial_rows = len(df)
            print(f"Initial rows: {initial_rows}")

            # --- Step 2: Remove doubled (duplicate) rows ---
            df = df.drop_duplicates()
            final_rows = len(df)

            # --- Step 3: Print summary ---
            duplicates_removed = initial_rows - final_rows
            print(f"Duplicate rows removed: {duplicates_removed}")
            print(f"Final rows: {final_rows}")

            # --- Step 4: Save the cleaned data to a new file ---
            # Only the trailing extension is replaced; str.replace would also
            # rewrite any '.csv' occurring earlier in the file name.
            stem = os.path.splitext(filename)[0]
            new_filename = stem + cleaned_suffix
            new_file_path = os.path.join(directory_path, new_filename)
            # index=False prevents pandas from writing the DataFrame index
            # as an extra column
            df.to_csv(new_file_path, index=False)
            print(f"Cleaned data saved to: '{new_filename}'\n")
        except Exception as e:
            print(f"Could not process {filename}. Error: {e}\n")
# --- How to use it ---
# Running this file as a script cleans the CSV files in the current
# working directory.
# To specify a different directory, pass it as an argument,
# e.g., process_csv_in_directory('/path/to/your/files')
# The __main__ guard keeps a plain import of this module from reading or
# writing any files as a side effect.
if __name__ == "__main__":
    process_csv_in_directory()