# hyper/_data/candles/!clean_csv.py

import pandas as pd
import os

def process_csv_in_directory(directory_path='.'):
    """
    Remove duplicate rows from every CSV file in a directory.

    Each file whose name ends in '.csv' is read with pandas, fully identical
    rows are dropped, and the result is written next to the original as
    '<name>_cleaned.csv'. The original file is never modified.

    Files whose name already ends in '_cleaned.csv' are treated as output of
    an earlier run and are skipped, so running the function repeatedly on the
    same directory does not create '<name>_cleaned_cleaned.csv' files.

    Progress and errors are reported with print(). A file that cannot be
    processed is reported and skipped; the remaining files are still handled.

    Args:
        directory_path (str): The path to the directory containing the CSV files.
                              Defaults to the current directory '.'.

    Returns:
        None
    """
    csv_ext = '.csv'
    cleaned_ext = '_cleaned' + csv_ext

    # 1. Get a list of all entries in the specified directory
    try:
        all_files = os.listdir(directory_path)
    except FileNotFoundError:
        print(f"Error: The directory '{directory_path}' was not found.")
        return

    # 2. Keep only CSV inputs. Outputs of a previous run are excluded so the
    #    function is safe to re-run; sorting makes the processing order stable
    #    because os.listdir() returns entries in arbitrary order.
    csv_files = sorted(
        f for f in all_files
        if f.endswith(csv_ext) and not f.endswith(cleaned_ext)
    )

    if not csv_files:
        print("No CSV files found in the directory.")
        return

    print(f"Found {len(csv_files)} CSV files to process...\n")

    # 3. Loop through each CSV file and process it
    for filename in csv_files:
        file_path = os.path.join(directory_path, filename)

        # Broad catch is deliberate: one unreadable/unwritable file must not
        # stop the rest of the batch; the error is reported, not swallowed.
        try:
            # --- Step 1: Open the CSV file ---
            print(f"--- Processing file: {filename} ---")
            df = pd.read_csv(file_path)
            initial_rows = len(df)
            print(f"Initial rows: {initial_rows}")

            # --- Step 2: Remove doubled (duplicate) rows ---
            df = df.drop_duplicates()
            final_rows = len(df)

            # --- Step 3: Print summary ---
            duplicates_removed = initial_rows - final_rows
            print(f"Duplicate rows removed: {duplicates_removed}")
            print(f"Final rows: {final_rows}")

            # --- Step 4: Save the updated CSV file ---
            # Swap only the trailing extension; str.replace() would also
            # rewrite any '.csv' that appears earlier in the file name.
            new_filename = filename[:-len(csv_ext)] + cleaned_ext
            new_file_path = os.path.join(directory_path, new_filename)

            # index=False prevents pandas from writing the DataFrame index as a column
            df.to_csv(new_file_path, index=False)
            print(f"Cleaned data saved to: '{new_filename}'\n")

        except Exception as e:
            print(f"Could not process {filename}. Error: {e}\n")

# --- How to use it ---
# Running this file as a script processes the current directory.
# To specify a different directory, pass it as an argument,
# e.g., process_csv_in_directory('/path/to/your/files')
# The guard keeps the clean-up from running as a side effect of an import.
if __name__ == "__main__":
    process_csv_in_directory()