"""Per-file and global statistics over a directory of CSV files.

Every CSV under IN_PATH must have "Letters" and "Numbers" columns.  Each
file is reduced to the median/std of "Numbers" per letter, the per-file
results are concatenated, and a global median/std of the per-file medians
is printed for each letter.
"""
from concurrent.futures import ThreadPoolExecutor  # for parallel processing

import os

import pandas as pd

IN_PATH = "./data/"


def process_file(file_num, file_path):
    """Compute median/std of "Numbers" for each "Letters" group of one CSV.

    Parameters
    ----------
    file_num : int
        Identifier recorded in the "fileId" column of the result.
    file_path : str
        Path of a CSV file containing "Letters" and "Numbers" columns.

    Returns
    -------
    pandas.DataFrame
        Columns ["fileId", "letter", "median", "stdDev"], one row per
        distinct letter, in sorted letter order.
    """
    data = pd.read_csv(file_path)
    rows = []
    # Sort the group keys so the output ordering is deterministic.
    for letter in sorted(data["Letters"].unique()):
        numbers = data[data["Letters"] == letter]["Numbers"]
        rows.append([file_num, letter, numbers.median(), numbers.std()])
    return pd.DataFrame(columns=["fileId", "letter", "median", "stdDev"], data=rows)


def main():
    """Process every file under IN_PATH in parallel and print global stats."""
    files = [IN_PATH + x for x in os.listdir(IN_PATH)]
    # Threads suit this workload: pd.read_csv spends its time in file I/O,
    # during which the GIL is released.
    with ThreadPoolExecutor(max_workers=8) as executor:
        per_file = executor.map(process_file, range(len(files)), files)
        result = pd.concat(per_file, ignore_index=True)
    # BUG FIX: sort_values returns a new frame rather than sorting in place;
    # the original discarded the return value, so the output stayed unsorted.
    result = result.sort_values(by=["fileId", "letter"])
    print(result)
    # Global statistics: median/std of the per-file medians, per letter.
    for letter in result["letter"].unique():
        medians = result[result["letter"] == letter]["median"]
        print(f"{letter}: median:{float(medians.median())}, std:{float(medians.std())}")


if __name__ == "__main__":
    main()