import csv
from datetime import datetime


def filter_csv_by_date(input_file: str, output_file: str, start_date_str: str) -> None:
    """Copy the rows of a CSV whose 'Open time' is on or after a start date.

    The input is streamed row by row with ``csv.reader`` so the whole file is
    never held in memory. The header row is copied to the output unchanged.
    Blank rows are ignored; rows whose 'Open time' cell is missing or does not
    match ``'%Y-%m-%d %H:%M:%S'`` are reported on stdout and skipped.

    All failures are reported via ``print`` rather than raised: a missing
    input file, an empty input file, a header without an 'Open time' column,
    an unparsable ``start_date_str`` and any other unexpected error.

    Args:
        input_file: Path to the input CSV.
        output_file: Path of the CSV to create (overwritten if it exists).
        start_date_str: Inclusive lower bound in 'YYYY-MM-DD' format; it is
            compared as midnight of that day against each row's timestamp.

    Returns:
        None. Progress and summary information is printed to stdout.
    """
    try:
        # Parse once up front; rows are compared against this naive datetime.
        start_date = datetime.strptime(start_date_str, '%Y-%m-%d')

        print(f"Filtering for dates on or after {start_date_str}...")
        print(f"Output will be saved to: {output_file}")

        # newline='' lets the csv module handle line endings itself.
        with open(input_file, 'r', newline='') as infile, \
                open(output_file, 'w', newline='') as outfile:

            reader = csv.reader(infile)
            writer = csv.writer(outfile)

            # 1. Read and write the header. An empty file has no header, so
            #    report that explicitly instead of letting StopIteration
            #    surface as a blank "unexpected error".
            header = next(reader, None)
            if header is None:
                print(f"Error: The file '{input_file}' is empty; no header row found.")
                return
            writer.writerow(header)

            # Find the index of the 'Open time' column
            try:
                date_column_index = header.index('Open time')
            except ValueError:
                print("Error: 'Open time' column not found in the header.")
                return

            # 2. Process the rest of the file row by row
            processed_lines = 0
            written_lines = 0

            for row in reader:
                processed_lines += 1

                # Blank lines come back from csv.reader as empty lists.
                if row:
                    try:
                        row_date = datetime.strptime(
                            row[date_column_index], '%Y-%m-%d %H:%M:%S'
                        )
                    except (ValueError, IndexError) as e:
                        # ValueError: date in the wrong format.
                        # IndexError: row has too few columns.
                        # "+ 1" accounts for the header line.
                        print(f"Skipping malformed row {processed_lines + 1}: {row}. Error: {e}")
                    else:
                        # 3. Keep the row if it is on or after the start date
                        if row_date >= start_date:
                            writer.writerow(row)
                            written_lines += 1

                # Progress for very long operations. Checked for every row,
                # including blank/malformed ones, so no milestone is missed.
                if processed_lines % 5000000 == 0:
                    print(f"Processed {processed_lines:,} lines...")

        print("\n--- Processing Complete ---")
        print(f"Total lines processed: {processed_lines:,}")
        print(f"Total lines written: {written_lines:,}")
        print(f"Filtered data saved to: {output_file}")

    except FileNotFoundError:
        print(f"Error: The file '{input_file}' was not found.")
    except Exception as e:
        # Top-level boundary of the script: report rather than crash.
        print(f"An unexpected error occurred: {e}")


# --- Configuration ---

# 1. Replace with the name of your large input file
input_filename = 'ETHUSDT_1m_Binance.csv'

# 2. Provide the start date in YYYY-MM-DD format
start_date_filter = '2025-07-01'  # <-- REPLACE THIS

# 3. The output filename is generated automatically in the requested format
if start_date_filter != 'YYYY-MM-DD':
    # Remove the hyphens for the filename, e.g. '2025-07-01' -> '20250701'
    filename_date_part = start_date_filter.replace('-', '')
    output_filename = f'ETHUSDT_{filename_date_part}.csv'
else:
    output_filename = 'ETHUSDT_unfiltered.csv'


# --- Run the script ---
if start_date_filter == 'YYYY-MM-DD':
    print("Please update the 'start_date_filter' variable in the script with a date like '2025-07-01'.")
else:
    filter_csv_by_date(input_filename, output_filename, start_date_filter)