dane z plików - nie chodzi

2025-07-17 00:12:33 +02:00
parent 2a556781da
commit f9d76c85bd
5 changed files with 596 additions and 109 deletions
--- a/data/data_miner.py
+++ b/data/data_miner.py
@@ -0,0 +1,102 @@
+import csv
+from datetime import datetime
+
+def filter_csv_by_date(input_file, output_file, start_date_str):
+    """
+    Stream a CSV file row by row and copy the rows dated on or after a start date.
+
+    The header row is copied unchanged. Each data row is compared using the
+    value in its 'Open time' column, parsed as '%Y-%m-%d %H:%M:%S'. Blank rows
+    are skipped silently; rows whose date cannot be parsed, or that are too
+    short to contain the column, are reported and skipped.
+
+    The output file is only opened after the input header has been validated,
+    so an empty input or a missing 'Open time' column never truncates an
+    existing output file.
+
+    Every problem is reported with print(); nothing is raised to the caller
+    and the function always returns None.
+
+    Args:
+        input_file (str): Path to the large input CSV.
+        output_file (str): Path to the output CSV file.
+        start_date_str (str): The start date in 'YYYY-MM-DD' format
+            (inclusive, compared as midnight of that day).
+    """
+    try:
+        # Convert the start date string into a datetime object for comparison
+        start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
+        print(f"Filtering for dates on or after {start_date_str}...")
+        print(f"Output will be saved to: {output_file}")
+
+        processed_lines = 0
+        written_lines = 0
+
+        with open(input_file, 'r', newline='') as infile:
+            reader = csv.reader(infile)
+
+            # 1. Validate the header before touching the output file.
+            header = next(reader, None)
+            if header is None:
+                print(f"Error: The file '{input_file}' is empty.")
+                return
+
+            try:
+                date_column_index = header.index('Open time')
+            except ValueError:
+                print("Error: 'Open time' column not found in the header.")
+                return
+
+            # 2. Only now create (and truncate) the output file.
+            with open(output_file, 'w', newline='') as outfile:
+                writer = csv.writer(outfile)
+                writer.writerow(header)
+
+                # 3. Process the rest of the file line by line
+                for row in reader:
+                    processed_lines += 1
+
+                    # Report progress up front so that skipped rows cannot
+                    # suppress the message.
+                    if processed_lines % 5000000 == 0:
+                        print(f"Processed {processed_lines:,} lines...")
+
+                    # Blank lines come back from csv.reader as empty lists
+                    if not row:
+                        continue
+
+                    try:
+                        row_date = datetime.strptime(
+                            row[date_column_index], '%Y-%m-%d %H:%M:%S'
+                        )
+                    except (ValueError, IndexError) as e:
+                        # Wrong date format, or the row has too few columns.
+                        # reader.line_num is the physical line in the file.
+                        print(f"Skipping malformed row {reader.line_num}: {row}. Error: {e}")
+                        continue
+
+                    # 4. Keep the row if it falls on or after the start date
+                    if row_date >= start_date:
+                        writer.writerow(row)
+                        written_lines += 1
+
+        print("\n--- Processing Complete ---")
+        print(f"Total lines processed: {processed_lines:,}")
+        print(f"Total lines written: {written_lines:,}")
+        print(f"Filtered data saved to: {output_file}")
+
+    except FileNotFoundError as e:
+        # e.filename is the path that was actually missing (the input file,
+        # or the output path when its directory does not exist).
+        print(f"Error: The file '{e.filename}' was not found.")
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
+
+# --- Settings ---
+# Name of the large input CSV to filter.
+input_filename = 'ETHUSDT_1m_Binance.csv'
+
+# Earliest date to keep, written as YYYY-MM-DD. Leaving the literal
+# placeholder 'YYYY-MM-DD' here skips the run and prints a reminder instead.
+start_date_filter = '2025-07-01'
+
+# --- Derive the output filename, then run ---
+if start_date_filter == 'YYYY-MM-DD':
+    # Placeholder was never replaced: fall back to the default name and
+    # ask the user to configure a real date.
+    output_filename = 'ETHUSDT_unfiltered.csv'
+    print("Please update the 'start_date_filter' variable in the script with a date like '2025-07-01'.")
+else:
+    # The start date with its hyphens removed becomes part of the filename.
+    filename_date_part = start_date_filter.replace('-', '')
+    output_filename = f'ETHUSDT_{filename_date_part}.csv'
+    filter_csv_by_date(input_filename, output_filename, start_date_filter)
+    
+