# File listing info: 66 lines, 2.4 KiB, Python
import pandas as pd
|
|
import os
|
|
|
|
def process_csv_in_directory(directory_path='.'):
    """Remove duplicate rows from every CSV file in a directory.

    Each file whose name ends with '.csv' (case-sensitive) is read with
    pandas, fully duplicated rows are dropped, and the result is written
    next to the original as '<name>_cleaned.csv'. The original file is
    never overwritten. A short summary is printed for every file.

    Args:
        directory_path (str): The path to the directory containing the
            CSV files. Defaults to the current directory '.'.

    Returns:
        None. Results are written to disk and progress is printed. If the
        directory does not exist, or contains no CSV files, a message is
        printed and the function returns early.
    """
    csv_suffix = '.csv'

    # 1. Get a list of all entries in the specified directory
    try:
        all_files = os.listdir(directory_path)
    except FileNotFoundError:
        print(f"Error: The directory '{directory_path}' was not found.")
        return

    # 2. Keep only CSV files. os.listdir returns entries in arbitrary order,
    #    so sort them to make processing (and the printed log) deterministic.
    csv_files = sorted(f for f in all_files if f.endswith(csv_suffix))

    if not csv_files:
        print("No CSV files found in the directory.")
        return

    print(f"Found {len(csv_files)} CSV files to process...\n")

    # 3. Loop through each CSV file and process it
    for filename in csv_files:
        file_path = os.path.join(directory_path, filename)

        # Best-effort per file: one unreadable or malformed CSV must not
        # stop the remaining files from being processed.
        try:
            # --- Step 1: Open the CSV file ---
            print(f"--- Processing file: {filename} ---")
            df = pd.read_csv(file_path)
            initial_rows = len(df)
            print(f"Initial rows: {initial_rows}")

            # --- Step 2: Remove doubled (duplicate) rows ---
            df = df.drop_duplicates()
            final_rows = len(df)

            # --- Step 3: Print summary ---
            duplicates_removed = initial_rows - final_rows
            print(f"Duplicate rows removed: {duplicates_removed}")
            print(f"Final rows: {final_rows}")

            # --- Step 4: Save the updated CSV file ---
            # Build a new filename so the original is not overwritten.
            # Only the trailing extension is rewritten: str.replace would
            # also alter any '.csv' appearing earlier in the name
            # (e.g. 'data.csv.old.csv').
            new_filename = filename[:-len(csv_suffix)] + '_cleaned' + csv_suffix
            new_file_path = os.path.join(directory_path, new_filename)

            # index=False prevents pandas from writing the DataFrame index
            # as an extra column.
            df.to_csv(new_file_path, index=False)
            print(f"Cleaned data saved to: '{new_filename}'\n")

        except Exception as e:
            print(f"Could not process {filename}. Error: {e}\n")
|
|
|
|
# --- How to use it ---
# Executing this file as a script runs the function on the current directory.
# To specify a different directory, pass it as an argument,
# e.g., process_csv_in_directory('/path/to/your/files')
# The guard keeps the directory from being processed as a side effect of
# merely importing this module.
if __name__ == "__main__":
    process_csv_in_directory()