dane z plików - nie chodzi
This commit is contained in:
102
data/data_miner.py
Normal file
102
data/data_miner.py
Normal file
@@ -0,0 +1,102 @@
|
||||
import csv
|
||||
from datetime import datetime
|
||||
|
||||
def filter_csv_by_date(input_file, output_file, start_date_str):
|
||||
"""
|
||||
Reads a large CSV file line by line, filters by a start date,
|
||||
and writes the results to a new file.
|
||||
|
||||
Args:
|
||||
input_file (str): Path to the large input CSV.
|
||||
output_file (str): Path to the output CSV file.
|
||||
start_date_str (str): The start date in 'YYYY-MM-DD' format.
|
||||
"""
|
||||
try:
|
||||
# Convert the start date string into a datetime object for comparison
|
||||
start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
|
||||
print(f"Filtering for dates on or after {start_date_str}...")
|
||||
print(f"Output will be saved to: {output_file}")
|
||||
|
||||
|
||||
# Open the input and output files
|
||||
with open(input_file, 'r', newline='') as infile, \
|
||||
open(output_file, 'w', newline='') as outfile:
|
||||
|
||||
reader = csv.reader(infile)
|
||||
writer = csv.writer(outfile)
|
||||
|
||||
# 1. Read and write the header
|
||||
header = next(reader)
|
||||
writer.writerow(header)
|
||||
|
||||
# Find the index of the 'Open time' column
|
||||
try:
|
||||
date_column_index = header.index('Open time')
|
||||
except ValueError:
|
||||
print("Error: 'Open time' column not found in the header.")
|
||||
return
|
||||
|
||||
# 2. Process the rest of the file line by line
|
||||
processed_lines = 0
|
||||
written_lines = 0
|
||||
for row in reader:
|
||||
processed_lines += 1
|
||||
|
||||
# Avoid errors from empty or malformed rows
|
||||
if not row:
|
||||
continue
|
||||
|
||||
try:
|
||||
# Get the date string from the correct column
|
||||
row_date_str = row[date_column_index]
|
||||
# Convert the row's date string to a datetime object
|
||||
row_date = datetime.strptime(row_date_str, '%Y-%m-%d %H:%M:%S')
|
||||
|
||||
# 3. Compare dates and write to new file if it's a match
|
||||
if row_date >= start_date:
|
||||
writer.writerow(row)
|
||||
written_lines += 1
|
||||
|
||||
except (ValueError, IndexError) as e:
|
||||
# This will catch errors if a date is in the wrong format
|
||||
# or if a row doesn't have enough columns.
|
||||
print(f"Skipping malformed row {processed_lines + 1}: {row}. Error: {e}")
|
||||
continue
|
||||
|
||||
# Optional: Print progress for very long operations
|
||||
if processed_lines % 5000000 == 0:
|
||||
print(f"Processed {processed_lines:,} lines...")
|
||||
|
||||
print("\n--- Processing Complete ---")
|
||||
print(f"Total lines processed: {processed_lines:,}")
|
||||
print(f"Total lines written: {written_lines:,}")
|
||||
print(f"Filtered data saved to: {output_file}")
|
||||
|
||||
except FileNotFoundError:
|
||||
print(f"Error: The file '{input_file}' was not found.")
|
||||
except Exception as e:
|
||||
print(f"An unexpected error occurred: {e}")
|
||||
|
||||
# --- Configuration ---
# Name of the large input file to filter.
input_filename = 'ETHUSDT_1m_Binance.csv'

# Start date in YYYY-MM-DD format; the literal 'YYYY-MM-DD' is treated as an
# unfilled placeholder.
start_date_filter = '2025-07-01'  # <-- REPLACE THIS

# --- Derive the output filename and run the script ---
if start_date_filter == 'YYYY-MM-DD':
    # Placeholder still in place: fall back to the default name and ask the
    # user to configure a real date instead of running the filter.
    output_filename = 'ETHUSDT_unfiltered.csv'
    print("Please update the 'start_date_filter' variable in the script with a date like '2025-07-01'.")
else:
    # Drop the hyphens so the date can be embedded in the output filename.
    filename_date_part = ''.join(start_date_filter.split('-'))
    output_filename = f'ETHUSDT_{filename_date_part}.csv'
    filter_csv_by_date(input_filename, output_filename, start_date_filter)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user