import pandas as pd
import os


def process_csv_in_directory(directory_path='.'):
    """
    Find all CSV files in a directory, remove duplicate rows, and save
    the cleaned data to new ``*_cleaned.csv`` files alongside the originals.

    Args:
        directory_path (str): The path to the directory containing the CSV
            files. Defaults to the current directory '.'.
    """
    # 1. Get a list of all files in the specified directory.
    #    A missing directory is reported, not raised.
    try:
        all_files = os.listdir(directory_path)
    except FileNotFoundError:
        print(f"Error: The directory '{directory_path}' was not found.")
        return

    # 2. Filter the list to include only CSV files, skipping the script's
    #    own *_cleaned.csv outputs so repeated runs are idempotent.
    csv_files = [
        f for f in all_files
        if f.endswith('.csv') and not f.endswith('_cleaned.csv')
    ]
    if not csv_files:
        print("No CSV files found in the directory.")
        return

    print(f"Found {len(csv_files)} CSV files to process...\n")

    # 3. Process each CSV file independently; one bad file must not
    #    abort the rest of the batch.
    for filename in csv_files:
        file_path = os.path.join(directory_path, filename)
        try:
            # --- Step 1: Open the CSV file ---
            print(f"--- Processing file: {filename} ---")
            df = pd.read_csv(file_path)
            initial_rows = len(df)
            print(f"Initial rows: {initial_rows}")

            # --- Step 2: Remove doubled (duplicate) rows ---
            df.drop_duplicates(inplace=True)
            final_rows = len(df)

            # --- Step 3: Print summary ---
            duplicates_removed = initial_rows - final_rows
            print(f"Duplicate rows removed: {duplicates_removed}")
            print(f"Final rows: {final_rows}")

            # --- Step 4: Save the updated CSV file ---
            # Build a new name so the original file is never overwritten.
            # os.path.splitext (rather than str.replace) ensures only the
            # final extension is modified, even if '.csv' also appears
            # elsewhere in the filename.
            base, ext = os.path.splitext(filename)
            new_filename = f"{base}_cleaned{ext}"
            new_file_path = os.path.join(directory_path, new_filename)
            # index=False prevents pandas from writing the DataFrame
            # index as an extra column.
            df.to_csv(new_file_path, index=False)
            print(f"Cleaned data saved to: '{new_filename}'\n")
        except Exception as e:
            # Broad catch is deliberate: report which file failed and
            # continue with the remaining files.
            print(f"Could not process {filename}. Error: {e}\n")


# --- How to use it ---
# Run the function on the current directory when executed as a script.
# To specify a different directory, pass it as an argument,
# e.g., process_csv_in_directory('/path/to/your/files')
if __name__ == "__main__":
    process_csv_in_directory()