"""Batch-preprocess paired signal (.txt) and label (.csv) files.

For every ID that has both a data file and a label file, the signal is
resampled to DATA_LENGTH grid points, amplitude-filtered against THRESHOLD,
and the surviving columns are stacked (one column per kept ID) and written
to the configured dataset/labelset output files with an ID header row.
"""

import os

import numpy as np
import pandas as pd
import scipy.ndimage

from config import Forward_Model_Config as cfg
from config import Path_Config as pcfg

# Parameters
data_per_ns = cfg.data_per_ns        # data grid num per ns
DATA_LENGTH = cfg.data_length        # Target length after interpolation (unit: grid number)
THRESHOLD = cfg.filter_threthold     # Minimum absolute value threshold for valid data

# Input and output paths
INPUT_DATA_FOLDER = pcfg.INPUT_DATA_FOLDER
INPUT_LABEL_FOLDER = pcfg.INPUT_LABEL_FOLDER
DATA_OUTPUT_FILE = pcfg.dataset_path
LABEL_OUTPUT_FILE = pcfg.labelset_path

# Get all data and label filenames
data_files = sorted(f for f in os.listdir(INPUT_DATA_FOLDER) if f.endswith(".txt"))
label_files = sorted(f for f in os.listdir(INPUT_LABEL_FOLDER) if f.endswith(".csv"))

# Only process IDs that have BOTH a data file and a label file
data_ids = {os.path.splitext(f)[0] for f in data_files}
label_ids = {os.path.splitext(f)[0] for f in label_files}
valid_ids = sorted(data_ids & label_ids)

# Storage for processed columns; kept_ids records which IDs actually
# produced a column, so the saved header is guaranteed to align with
# the data/label columns (the original built headers from valid_ids,
# which mislabeled columns whenever an ID was skipped).
all_data = []
all_labels = []
kept_ids = []

for data_id in valid_ids:
    try:
        data_path = os.path.join(INPUT_DATA_FOLDER, data_id + ".txt")
        label_path = os.path.join(INPUT_LABEL_FOLDER, data_id + ".csv")

        # Load data file (1D signal data)
        raw_data = np.loadtxt(data_path, delimiter=",", skiprows=0)
        # "Remove direct wave component" — NOTE(review): [:] is currently a
        # full copy (no samples removed); confirm whether a real slice such as
        # raw_data[start:] was intended here.
        processed_data = raw_data[:]

        # Interpolate data to the target length (linear, order=1)
        processed_data = scipy.ndimage.zoom(
            processed_data, DATA_LENGTH / processed_data.shape[0], order=1
        )

        # Skip data if the maximum absolute value is below the threshold
        if np.max(np.abs(processed_data)) < THRESHOLD:
            print(f"Skipping {data_id} due to low signal amplitude.")
            continue

        # Load label (dropping the first row) BEFORE appending anything, so a
        # label-load failure cannot leave all_data and all_labels misaligned
        # (the original appended the data column first, then could bail out).
        label = np.delete(
            np.loadtxt(label_path, delimiter=",", skiprows=0), [0], axis=0
        )

        # Append data, label, and ID atomically
        all_data.append(processed_data)
        all_labels.append(label)
        kept_ids.append(data_id)
    except Exception as e:
        print(f"Error processing file {data_id}: {e}")
        continue

# Convert lists to 2D arrays (one column per kept ID)
combined_data = np.column_stack(all_data) if all_data else np.array([])
combined_labels = np.column_stack(all_labels) if all_labels else np.array([])

# Header of IDs actually kept — alignment with columns is guaranteed by
# construction, so no trimming fallback is needed.
# NOTE(review): int() assumes file stems are numeric; non-numeric stems
# would raise here (same assumption as the original).
id_header = (
    np.array([int(id_) for id_ in kept_ids]).reshape(1, -1)
    if kept_ids
    else np.array([[]])
)

# Save processed data
if combined_data.size > 0:
    combined_data_with_header = np.vstack((id_header, combined_data))
    np.savetxt(DATA_OUTPUT_FILE, combined_data_with_header, delimiter=",", fmt="%g")
    print(f"Processed data saved to {DATA_OUTPUT_FILE}")
else:
    print("No valid data files found.")

# Save processed labels
if combined_labels.size > 0:
    combined_labels_with_header = np.vstack((id_header, combined_labels))
    np.savetxt(LABEL_OUTPUT_FILE, combined_labels_with_header, delimiter=",", fmt="%g")
    print(f"Processed labels saved to {LABEL_OUTPUT_FILE}")
else:
    print("No valid label files found.")