From a0492c3aa8bf946ba00a84df83c201fdfe1ece7d Mon Sep 17 00:00:00 2001
From: OkunElya
Date: Thu, 23 Oct 2025 18:45:11 +1000
Subject: [PATCH] added lab 1

---
 lab1/.gitignore      |  2 ++
 lab1/genrate_data.py | 28 ++++++++++++++++++++++++++++
 lab1/process_data.py | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 lab1/readme.md       |  6 ++++++
 4 files changed, 86 insertions(+)
 create mode 100644 lab1/.gitignore
 create mode 100644 lab1/genrate_data.py
 create mode 100644 lab1/process_data.py
 create mode 100644 lab1/readme.md

diff --git a/lab1/.gitignore b/lab1/.gitignore
new file mode 100644
index 0000000..f9c9a47
--- /dev/null
+++ b/lab1/.gitignore
@@ -0,0 +1,2 @@
+.vscode
+data/*
diff --git a/lab1/genrate_data.py b/lab1/genrate_data.py
new file mode 100644
--- /dev/null
+++ b/lab1/genrate_data.py
@@ -0,0 +1,28 @@
+"""Generate sample CSV files with a random letter and a random number per row."""
+import os
+
+import numpy as np
+import pandas as pd
+
+OUT_PATH = "./data/"
+FILE_COUNT = 5
+ROW_COUNT = int(1E5)
+LETTERS = ["A", "B", "C", "D"]
+
+
+def main():
+    # data/ is git-ignored, so it does not exist in a fresh checkout.
+    os.makedirs(OUT_PATH, exist_ok=True)
+    for file_num in range(1, FILE_COUNT + 1):
+        data = pd.DataFrame(
+            {
+                "Letters": np.random.choice(LETTERS, ROW_COUNT),
+                "Numbers": np.random.rand(ROW_COUNT),
+            }
+        )
+        out_file = os.path.join(OUT_PATH, f"sample_{file_num}.csv")
+        data.to_csv(out_file, index=False, encoding="utf-8")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/lab1/process_data.py b/lab1/process_data.py
new file mode 100644
--- /dev/null
+++ b/lab1/process_data.py
@@ -0,0 +1,50 @@
+"""Compute per-file and overall median/std of the numbers for every letter."""
+from concurrent.futures import ThreadPoolExecutor  # for parallel processing
+import os
+
+import pandas as pd
+
+IN_PATH = "./data/"
+
+
+def process_file(file_num, file_path):
+    """Return per-letter median and std of the numbers in one CSV file.
+
+    The result has the columns fileId, letter, median and stdDev and is
+    ordered by letter.
+    """
+    data = pd.read_csv(file_path)
+    # groupby sorts the keys, so rows come out ordered by letter.
+    stats = data.groupby("Letters")["Numbers"].agg(["median", "std"]).reset_index()
+    stats.columns = ["letter", "median", "stdDev"]
+    stats.insert(0, "fileId", file_num)
+    return stats
+
+
+def main():
+    # os.listdir returns names in arbitrary order and may contain other files;
+    # sort and filter so that fileId is stable between runs.
+    files = sorted(
+        os.path.join(IN_PATH, name)
+        for name in os.listdir(IN_PATH)
+        if name.endswith(".csv")
+    )
+    if not files:
+        raise SystemExit(f"no .csv files found in {IN_PATH}")
+
+    with ThreadPoolExecutor(max_workers=8) as executor:
+        results = list(executor.map(process_file, range(len(files)), files))
+
+    result = pd.concat(results, ignore_index=True)
+    # sort_values returns a new frame, so the result has to be re-assigned.
+    result = result.sort_values(by=["fileId", "letter"], ignore_index=True)
+    print(result)
+
+    # Median and std of the per-file medians for every letter.
+    overall = result.groupby("letter")["median"].agg(["median", "std"])
+    for letter, row in overall.iterrows():
+        print(f"{letter}: median:{float(row['median'])}, std:{float(row['std'])}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/lab1/readme.md b/lab1/readme.md
new file mode 100644
index 0000000..66b10da
--- /dev/null
+++ b/lab1/readme.md
@@ -0,0 +1,6 @@
+Аээ, потрачено 20 минут
+изначально полез делать чтобы функция возвращала жсон,
+но стало впадлу его соеденять и грузить
+хорошее задание, мне всё понравилось
+
+впервые после многолетнего перерыва работаю с пандасам
\ No newline at end of file