From 652f2378e8df0bce4cfc47f7b23d870ad02dbc2f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=9C=E9=B9=8F=E9=A3=9E?=
Date: Tue, 6 Aug 2024 11:08:31 +0800
Subject: [PATCH] Add files via upload

---
 0_csvpath.py        |  61 ++++++++++
 0utils.py           |  87 ++++++++++++++
 __init__.py         |   0
 m0_download.py      | 120 +++++++++++++++++++
 m10_predict.py      |   4 +
 m1_data_clean.py    |  44 +++++++
 m2_features.py      | 286 ++++++++++++++++++++++++++++++++++++++++++++
 m3_loaddatasets.py  | 192 +++++++++++++++++++++++++++++
 m4_embedding.py     |   0
 m5_position.py      |  15 +++
 m6_vit.py           | 133 ++++++++++++++++++++
 m6_vit_1d.py        | 153 ++++++++++++++++++++++++
 m7_train.py         | 107 +++++++++++++++++
 m8_evaluate.py      |  40 +++++++
 m9_infer.py         |  39 ++++++
 t1_create_sample.py |  33 +++++
 z-predict.py        |   0
 zutils.py           | 179 +++++++++++++++++++++++++++
 18 files changed, 1493 insertions(+)
 create mode 100644 0_csvpath.py
 create mode 100644 0utils.py
 create mode 100644 __init__.py
 create mode 100644 m0_download.py
 create mode 100644 m10_predict.py
 create mode 100644 m1_data_clean.py
 create mode 100644 m2_features.py
 create mode 100644 m3_loaddatasets.py
 create mode 100644 m4_embedding.py
 create mode 100644 m5_position.py
 create mode 100644 m6_vit.py
 create mode 100644 m6_vit_1d.py
 create mode 100644 m7_train.py
 create mode 100644 m8_evaluate.py
 create mode 100644 m9_infer.py
 create mode 100644 t1_create_sample.py
 create mode 100644 z-predict.py
 create mode 100644 zutils.py

diff --git a/0_csvpath.py b/0_csvpath.py
new file mode 100644
index 0000000..afc16a6
--- /dev/null
+++ b/0_csvpath.py
@@ -0,0 +1,61 @@
+import os
+
+# base_dir = os.path.join(os.path.dirname(__file__), "../datasets")
+
+
+def csv_paths(dataset_path=""):
+    qfq_weekly = os.path.join(dataset_path, "weekly_qfq")
+    qfq_daily = os.path.join(dataset_path, "daily_qfq")
+    hfq_weekly = os.path.join(dataset_path, "weekly_hfq")
+    hfq_daily = os.path.join(dataset_path, "daily_hfq")
+
+    features_weekly = os.path.join(dataset_path, "weekly_features")
+    features_daily = os.path.join(dataset_path, "daily_features")
+
+    templates_weekly = os.path.join(dataset_path, "weekly_templates")
+    templates_daily = os.path.join(dataset_path, "daily_templates")
+
+    results_weekly = os.path.join(dataset_path, "weekly_results")
+    results_daily = os.path.join(dataset_path, "daily_results")
+
+    # Create the whole tree up front; exist_ok makes the repeated
+    # os.path.exists checks unnecessary.
+    for d in (qfq_weekly, qfq_daily, hfq_weekly, hfq_daily,
+              features_weekly, features_daily,
+              templates_weekly, templates_daily,
+              results_weekly, results_daily):
+        os.makedirs(d, exist_ok=True)
+
+    week = {
+        "qfq": qfq_weekly,
+        "hfq": hfq_weekly,
+        "features": features_weekly,
+        "templates": templates_weekly,
+        "results": results_weekly
+    }
+    daily = {
+        "qfq": qfq_daily,
+        "hfq": hfq_daily,
+        "features": features_daily,
+        "templates": templates_daily,
+        "results": results_daily
+    }
+    all_codes_path = os.path.join(dataset_path, "0all.csv")
+    return all_codes_path, week, daily
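csv_paths builds the whole directory tree and hands back the paths grouped by frequency. A minimal usage sketch (the ./datasets root is an assumption, matching the defaults used elsewhere in this patch); note that the leading digit in the filename blocks a plain import statement:

    import importlib

    csvpath = importlib.import_module("0_csvpath")  # "import 0_csvpath" is a SyntaxError
    all_codes_path, week, daily = csvpath.csv_paths("./datasets")
    print(all_codes_path)       # ./datasets/0all.csv
    print(week["features"])     # ./datasets/weekly_features
    print(daily["results"])     # ./datasets/daily_results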
diff --git a/0utils.py b/0utils.py
new file mode 100644
index 0000000..c4db582
--- /dev/null
+++ b/0utils.py
@@ -0,0 +1,87 @@
+import pandas as pd
+from bson.json_util import dumps
+from io import StringIO
+import json
+from datetime import datetime
+import time
+
+
+def download_to_column(df):
+    ##################################################################
+    # df = stock_hfq_df
+    # aa = df.loc[:, "收盘"]
+    # df["前收盘"] = aa.shift()
+    # df.at[0, "前收盘"] = df.at[0, "收盘"]
+    #
+    # tt = df.loc[:, "换手率"]
+    # df["前换手率"] = tt.shift()
+    # df.at[0, "前换手率"] = df.at[0, "换手率"]
+
+    # df["换手率"] = df["换手率"].astype(float)
+    # df["前换手率"] = df["换手率"].pct_change()
+    # df["收盘"] = df["收盘"].astype(float)
+    # df["前收盘"] = df["收盘"].pct_change()
+    ##################################################################
+    stock = pd.DataFrame()
+    # stock["d"] = time.strftime("%Y-%m-%d", time.localtime(df.loc[:, "日期"]))
+    stock["dateT"] = df.loc[:, "日期"].astype(str)
+    stock["open"] = df.loc[:, "开盘"].astype(float)
+    stock["close"] = df.loc[:, "收盘"].astype(float)
+    # stock["pre_close"] = df.loc[:, "前收盘"]
+    stock["high"] = df.loc[:, "最高"].astype(float)
+    stock["low"] = df.loc[:, "最低"].astype(float)
+    stock["turnover"] = df.loc[:, "换手率"].astype(float)
+    # stock['pre_turnover'] = df.loc[:, "前换手率"]
+    stock["zf"] = df.loc[:, "振幅"].astype(float)
+    stock["zdf"] = df.loc[:, "涨跌幅"].astype(float)
+    # # kdj9(stock)
+    # # kdj45(stock)
+    # # ma(stock)
+    # data = stock.replace(np.nan, 0)
+    # data = data.replace(np.inf, 0)
+    # data = data.fillna(0)
+
+    return stock
+
+
+def jsonl_2_dp(jsonl):
+    json_data = dumps(jsonl, indent=2)
+    pd_stocks = pd.read_json(StringIO(json_data))
+    return pd_stocks
+
+
+def dp_2_jsonl(dataset):
+    docs = dataset.to_json(orient="records")
+    docs = json.loads(docs)
+    # collection.insert_many(docs)
+    return docs
+
+
+def timestamp2time(timestamp=1707769336):
+    # "from datetime import datetime" is in scope, so fromtimestamp is called
+    # on the class directly; datetime.datetime... would raise AttributeError
+    dateArray = datetime.fromtimestamp(timestamp)
+    # otherStyleTime = dateArray.strftime("%Y-%m-%d %H:%M:%S")
+    otherStyleTime = dateArray.strftime("%Y-%m-%d")
+    print(otherStyleTime)
+    return otherStyleTime
+
+
+def time2timestamp(t="2024-02-13 04:22:16"):
+    # time given as a string
+    # parse it into a time struct
+    timeArray = time.strptime(t, "%Y-%m-%d %H:%M:%S")
+    print(timeArray)
+    # timeArray exposes tm_year and friends
+    # convert the struct to an epoch timestamp
+    timeStamp = int(time.mktime(timeArray))
+    print(timeStamp)
+    return timeStamp
+
+
+######################################################
+######################################################
+######################################################
+def data_clean(df):
+    # placeholder, not implemented yet
+    return
+
+######################################################
+######################################################
\ No newline at end of file
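The two time helpers are near inverses: time2timestamp parses a full datetime string into epoch seconds (via the local timezone), while timestamp2time deliberately keeps only the date. A round-trip sketch; importlib is needed again because the module name starts with a digit:

    import importlib

    u = importlib.import_module("0utils")
    ts = u.time2timestamp("2024-02-13 04:22:16")  # epoch seconds; exact value depends on the local tz
    day = u.timestamp2time(ts)                    # "2024-02-13", the time of day is dropped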
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/m0_download.py b/m0_download.py
new file mode 100644
index 0000000..91ba8f8
--- /dev/null
+++ b/m0_download.py
@@ -0,0 +1,120 @@
+import baostock as bs
+import pandas as pd
+
+#### log in ####
+# lg = bs.login()
+# print the login response
+# print("login respond error_code:" + lg.error_code)
+# print("login respond error_msg:" + lg.error_msg)
+
+
+def get_all_codes(save_path="./datasets"):
+    # s = bs.query_all_stock()
+    # s_df = s.get_data()
+    # s_df.to_csv("./datasets/all_codes.csv", encoding="utf-8", index=False)
+    #### log in ####
+    lg = bs.login()
+    # print the login response
+    print("login respond error_code:" + lg.error_code)
+    print("login respond error_msg:" + lg.error_msg)
+
+    #### fetch the security list ####
+    rs = bs.query_all_stock(day="2024-06-20")
+    print("query_all_stock respond error_code:" + rs.error_code)
+    print("query_all_stock respond error_msg:" + rs.error_msg)
+
+    #### collect the result set ####
+    data_list = []
+    while (rs.error_code == "0") & rs.next():
+        # fetch one record at a time and merge them
+        data_list.append(rs.get_row_data())
+    result = pd.DataFrame(data_list, columns=rs.fields)
+
+    #### write the result set to a csv file ####
+    result.to_csv(save_path + "/all_codes.csv", encoding="utf-8", index=False)
+    print(result)
+
+    #### log out ####
+    bs.logout()
+
+
+def download_code_hist(
+    save_path="./datasets/origins",
+    code="sh.600000",
+    start_date="1999-09-01",
+    end_date="2024-06-30",
+    freq="d",
+    adjustflag="1"
+):
+
+    #### fetch A-share historical K-line data ####
+    # For the full field list see the "historical quote fields" section of the
+    # baostock docs; minute-line fields differ from daily fields, and minute
+    # lines do not cover indices.
+    # minute-line fields: date,time,code,open,high,low,close,volume,amount,adjustflag
+    # weekly/monthly fields: date,code,open,high,low,close,volume,amount,adjustflag,turn,pctChg
+    # no trailing comma here: it would turn the string into a one-element tuple
+    fields = "date,time,code,open,high,low,close,volume,adjustflag"
+    if freq == "d":
+        fields = "date,code,open,high,low,close,preclose,volume,amount,adjustflag,turn,tradestatus,pctChg,isST"
+    rs = bs.query_history_k_data_plus(
+        code,
+        # "date,time,code,open,high,low,close,volume,adjustflag",
+        # "date,code,open,high,low,close,preclose,volume,amount,adjustflag,turn,tradestatus,pctChg,isST",
+        fields=fields,
+        start_date=start_date,
+        end_date=end_date,
+        frequency=freq,
+        adjustflag=adjustflag,  # hfq
+    )
+
+    #### collect the result set ####
+    data_list = []
+    while (rs.error_code == "0") & rs.next():
+        # fetch one record at a time and merge them
+        data_list.append(rs.get_row_data())
+    result = pd.DataFrame(data_list, columns=rs.fields)
+
+    # print(result)
+
+    #### write the result set to a csv file ####
+    filename = save_path + "/" + code + ".csv"
+    result.to_csv(filename, index=True)
+    # print(result)
+
+
+def download_codes(save_path="./datasets/origins", filters=[], freq="d", adjustflag="1"):
+    codes = pd.read_csv("./datasets/all_codes.csv")
+    codes = codes["code"].tolist()
+
+    lg = bs.login()
+    # print the login response
+    print("login respond error_code:" + lg.error_code)
+    print("login respond error_msg:" + lg.error_msg)
+
+    for code in codes:
+        ignore = True
+        for f in filters:
+            if code.startswith(f):
+                ignore = False
+        if ignore:
+            continue
+
+        print(code)
+        # if code < "sz.300138":
+        #     continue
+
+        download_code_hist(save_path=save_path, code=code, freq=freq, adjustflag=adjustflag)
+    #### log out ####
+    bs.logout()
+
+
+# get_all_codes(save_path="./datasets")
+download_codes(
+    save_path="./datasets/cyday",
+    filters=["sh.68", "sz.3"],
+    # filters=["sz.301"]
+    freq="d",
+    adjustflag="1"  # hfq
+)
diff --git a/m10_predict.py b/m10_predict.py
new file mode 100644
index 0000000..4621a49
--- /dev/null
+++ b/m10_predict.py
@@ -0,0 +1,4 @@
+import torch
+
+# restores the full pickled model saved by m7_train.py
+model = torch.load("./model.pt")
+
diff --git a/m1_data_clean.py b/m1_data_clean.py
new file mode 100644
index 0000000..ace1c7b
--- /dev/null
+++ b/m1_data_clean.py
@@ -0,0 +1,44 @@
+import pandas as pd
+import torch
+import os
+import numpy as np
+
+
+def clean_volume_0(df):
+
+    # replace NaN/inf with 0
+    df = df.replace(np.nan, 0)
+    df = df.replace(np.inf, 0)
+    df = df.fillna(0)
+
+    df["volume"] = df["volume"].astype(int)
+
+    # Drop zero-volume and zero-close rows with boolean masks; positional
+    # double-indexing (df.drop(df.index[index])) goes stale after the first
+    # drop has shifted row positions.
+    df = df[df["volume"] != 0]
+    df = df[df["close"] != 0]
+    df = df.reset_index(drop=True)  # reset_index returns a copy, so reassign it
+
+    return df
+
+
+def process_features(in_dir, out_dir):
+    # codes = pd.read_csv("./datasets/all_codes.csv")
+    # codes = codes["code"].tolist()
+
+    # codes = ["sz.300001"]
+    file_dir = in_dir  # "./datasets/origins"
+
+    a_s = [a for a in sorted(os.listdir(file_dir), key=lambda x: str(x[5:]))]
+    for a in a_s:
+        file = os.path.join(file_dir, a)
+        df = pd.read_csv(file)
+
+        df = clean_volume_0(df)
+
+        df.to_csv(os.path.join(out_dir, a))
+        print(a)
+
+
+process_features(in_dir="./datasets/cyday",
+                 out_dir="./datasets/cleaned/")
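The kdj4/kdj9/kdj45/kdj160 functions in m2_features.py below differ only in the rolling window and smoothing constant. If they ever need to stay in sync, one parameterized helper covers all of them; a sketch, assuming the same column conventions as the file below:

    def kdj(df, window, com, low="low", high="high", close="close"):
        # rolling extremes, backfilled with expanding extremes for the warm-up rows
        low_list = df[low].rolling(window).min()
        low_list.fillna(value=df[low].expanding().min(), inplace=True)
        high_list = df[high].rolling(window).max()
        high_list.fillna(value=df[high].expanding().max(), inplace=True)

        rsv = (df[close] - low_list) / (high_list - low_list) * 100
        k, d = "k" + str(window), "d" + str(window)
        df[k] = rsv.ewm(com=com).mean()
        df[d] = df[k].ewm(com=com).mean()
        df["j" + str(window)] = 3 * df[k] - 2 * df[d]

    # kdj(df, 9, 3) reproduces kdj9; kdj(df, 45, 15) reproduces kdj45, and so on.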
diff --git a/m2_features.py b/m2_features.py
new file mode 100644
index 0000000..29070b2
--- /dev/null
+++ b/m2_features.py
@@ -0,0 +1,286 @@
+import akshare as ak
+from io import StringIO
+from bson.json_util import dumps
+import json
+import os
+import numpy as np
+import pandas as pd
+
+X_Length = 30
+
+pd.set_option("display.width", 1000)
+pd.set_option("display.max_rows", None)
+pd.set_option("display.max_columns", None)
+pd.set_option("display.max_colwidth", 1000)
+
+
+def calc_pre_high_list(df):
+    # Trace back the previous 3 highs; if there is only one,
+    # the other two are taken to equal the first.
+    # df["high"] = df["high"].astype(float)
+    ph = []
+    pl = []
+    # Collect per-row snapshots and assign whole columns at the end;
+    # chained df.iloc[i]["ph"] = ... assignment would be a silent no-op.
+    ph_col = []
+    pl_col = []
+    for i in range(len(df)):
+        h = df.iloc[i]["high"]
+        l = df.iloc[i]["low"]
+        if len(ph) < 1:
+            ph.append(h)
+            pl.append(l)
+            ph_col.append(h)
+            pl_col.append(l)
+            continue
+
+        ma20 = df.iloc[i]["ma20"]
+        ma60 = df.iloc[i]["ma60"]
+        if ma20 > ma60:
+            r = all(v > h for v in ph)
+            if not r:
+                pre_v = -1
+                for v in reversed(ph):
+                    if v > h:
+                        pre_v = v
+                if pre_v > 0:
+                    ph.append(pre_v)
+                else:
+                    ph.append(h)
+        else:
+            # mirror of the branch above, against the lows
+            r = all(v < l for v in pl)
+            if not r:
+                pre_v = -1
+                for v in reversed(pl):
+                    if v < l:
+                        pre_v = v
+                if pre_v > 0:
+                    pl.append(pre_v)
+                else:
+                    pl.append(l)
+
+        ph_col.append(list(ph[:-1]))
+        pl_col.append(list(pl[:-1]))
+
+    df["ph"] = ph_col
+    df["pl"] = pl_col
+
+    return df
+
+
+def calc_pre_high_list2(df):
+    # Same tracing as above, but the trend test is close vs. ma20.
+    ph = []
+    pl = []
+    ph_col = []
+    pl_col = []
+    for i in range(len(df)):
+        h = df.iloc[i]["high"]
+        l = df.iloc[i]["low"]
+        if len(ph) < 1:
+            ph.append(h)
+            pl.append(l)
+            ph_col.append(h)
+            pl_col.append(l)
+            continue
+        c = df.iloc[i]["close"]
+        ma20 = df.iloc[i]["ma20"]
+        ma60 = df.iloc[i]["ma60"]
+        if c > ma20:
+            r = all(v > h for v in ph)
+            if not r:
+                pre_v = -1
+                for v in reversed(ph):
+                    if v > h:
+                        pre_v = v
+                if pre_v > 0:
+                    ph.append(pre_v)
+                else:
+                    ph.append(h)
+        else:
+            r = all(v < l for v in pl)
+            if not r:
+                pre_v = -1
+                for v in reversed(pl):
+                    if v < l:
+                        pre_v = v
+                if pre_v > 0:
+                    pl.append(pre_v)
+                else:
+                    pl.append(l)
+
+        ph_col.append(list(ph[:-1]))
+        pl_col.append(list(pl[:-1]))
+
+    df["ph"] = ph_col
+    df["pl"] = pl_col
+
+    return df
+
+
+def kdj_window(df, window=160, m1=60, m2=60, low="low", high="high", close="close"):
+    low_list = df[low].rolling(window).min()
+    low_list.fillna(value=df[low].expanding().min(), inplace=True)
+    high_list = df[high].rolling(window).max()
+    high_list.fillna(value=df[high].expanding().max(), inplace=True)
+
+    rsv = (df[close] - low_list) / (high_list - low_list) * 100
+    k = 'k' + str(window)
+    d = 'd' + str(window)
+    # derive d and j from this window's own k column, not the hard-coded k9/d9
+    df[k] = rsv.ewm(alpha=1 / m1, adjust=False).mean()
+    df[d] = df[k].ewm(alpha=1 / m2, adjust=False).mean()
+    df['j' + str(window)] = 3 * df[k] - 2 * df[d]
+
+
+def kdj4(df, low="low", high="high", close="close"):
+    low_list = df[low].rolling(window=4).min()
+    low_list.fillna(value=df[low].expanding().min(), inplace=True)
+    high_list = df[high].rolling(window=4).max()
+    high_list.fillna(value=df[high].expanding().max(), inplace=True)
+
+    rsv = (df[close] - low_list) / (high_list - low_list) * 100
+    df['k4'] = rsv.ewm(com=3).mean()
+    df['d4'] = df['k4'].ewm(com=3).mean()
+    df['j4'] = 3 * df['k4'] - 2 * df['d4']
+def kdj160(df, low="low", high="high", close="close"):
+    low_list = df[low].rolling(window=160).min()
+    low_list.fillna(value=df[low].expanding().min(), inplace=True)
+    high_list = df[high].rolling(window=160).max()
+    high_list.fillna(value=df[high].expanding().max(), inplace=True)
+
+    rsv = (df[close] - low_list) / (high_list - low_list) * 100
+    df['k160'] = rsv.ewm(com=60).mean()
+    df['d160'] = df['k160'].ewm(com=60).mean()
+    df['j160'] = 3 * df['k160'] - 2 * df['d160']
+
+
+def kdj9(df, low="low", high="high", close="close"):
+    low_list = df[low].rolling(window=9).min()
+    low_list.fillna(value=df[low].expanding().min(), inplace=True)
+    high_list = df[high].rolling(window=9).max()
+    high_list.fillna(value=df[high].expanding().max(), inplace=True)
+
+    rsv = (df[close] - low_list) / (high_list - low_list) * 100
+    df['k9'] = rsv.ewm(com=3).mean()
+    df['d9'] = df['k9'].ewm(com=3).mean()
+    df['j9'] = 3 * df['k9'] - 2 * df['d9']
+
+
+def kdj45(df, low="low", high="high", close="close"):
+    low_list = df[low].rolling(window=45).min()
+    low_list.fillna(value=df[low].expanding().min(), inplace=True)
+    high_list = df[high].rolling(window=45).max()
+    high_list.fillna(value=df[high].expanding().max(), inplace=True)
+
+    rsv = (df[close] - low_list) / (high_list - low_list) * 100
+    df['k45'] = rsv.ewm(com=15).mean()
+    df['d45'] = df['k45'].ewm(com=15).mean()
+    df['j45'] = 3 * df['k45'] - 2 * df['d45']
+
+
+def ma(df, close="close"):
+    # rolling means keep NaN in the warm-up rows; a trailing .dropna() is a
+    # no-op here because column assignment realigns on the index anyway
+    df['ma5'] = df[close].rolling(window=5).mean()
+    df['ma20'] = df[close].rolling(window=20).mean()
+    df["ma60"] = df[close].rolling(window=60).mean()
+    # df['ma1000'] = df[close].rolling(window=1000).mean()
+
+
+# def zdf(data):
+#     if data.shape[0] % 16 != 0:
+#         print("error================================> not 16 ")
+
+#     zdf = pd.DataFrame()
+#     zdf["zdf"] = -1
+#     for i in range(int(data.shape[0] / 16) - 1):
+#         s = int(i * 16 + 16)
+#         e = int(s + 16)
+#         print(e)
+#         p1 = data[s-1:s]["close"].values[0]
+#         p2 = data[e-1:e]["close"].values[0]
+#         print(p1)
+#         print(p2)
+#         zdf[e-1:e]["zdf"] = p2 / p1
+#     print(zdf["zdf"])
+#     data["zdf"] = zdf["zdf"]
+
+#     return data
+
+
+def get_features(data):
+    lines = data.shape[0]
+    if lines < 120:
+        return None
+
+    ma(data)
+    # data = zdf(data)
+    data = data.loc[60:, :]
+    # data = data.copy()
+    # data.reset_index(drop=True)
+
+    # data = calc_pre_high_list(data)
+    dataset = pd.DataFrame()
+
+    close = data.iloc[0:1]["close"].values[0]
+    volume = data.iloc[0:1]["volume"].values[0]
+    # print(volume)
+    dataset["date"] = data["date"]
+
+    # dataset["high"] = data["high"].astype(float) / close
+    # dataset["low"] = data["low"].astype(float) / close
+    # dataset["open"] = data["open"].astype(float) / close
+    # dataset["close"] = data["close"].astype(float) / close
+
+    # dataset["ma5"] = data["ma5"].astype(float) / close
+    # dataset["ma20"] = data["ma20"].astype(float) / close
+    # dataset["ma60"] = data["ma60"].astype(float) / close
+
+    # dataset["volume"] = data["volume"].astype(float) / volume
+
+    dataset["zdf"] = data["close"].astype(float) / data["preclose"].astype(float)
+
+    dataset["high"] = data["high"].astype(float)
+    dataset["low"] = data["low"].astype(float)
+    dataset["open"] = data["open"].astype(float)
+    dataset["close"] = data["close"].astype(float)
+
+    dataset["ma5"] = data["ma5"].astype(float)
+    dataset["ma20"] = data["ma20"].astype(float)
+    dataset["ma60"] = data["ma60"].astype(float)
+    dataset["volume"] = data["volume"].astype(float)
+
+    # keep only series whose daily moves all stay within [-30%, +30%)
+    if all(dataset['zdf'] >= 0.7) and all(dataset['zdf'] < 1.3):
+        return dataset
+
+    return None
+
+    # null replace to 0
+    # dataset = dataset.replace(np.nan, 0)
+    # dataset = dataset.replace(np.inf, 0)
+    # dataset = dataset.fillna(0)
+
+    # return dataset
+
+
+def process_features(in_dir, out_dir):
+    # codes = pd.read_csv("./datasets/all_codes.csv")
+    # codes = codes["code"].tolist()
+
+    # codes = ["sz.300001"]
+    file_dir = in_dir  # "./datasets/origins"
+
+    a_s = [a for a in sorted(os.listdir(file_dir), key=lambda x: str(x[5:]))]
+    for a in a_s:
+        # if a.startswith("sz.30"):
+        #     continue
+        file = os.path.join(file_dir, a)
+        df = pd.read_csv(file)
+        df = get_features(df)
+        if df is not None:
+            # file = "./datasets/features/" + a
+            file = os.path.join(out_dir, a)
+            df.to_csv(file, index=False)
+            print(a)
+        # break
+
+
+process_features(in_dir="./datasets/cleaned", out_dir="./datasets/features/")
+
+# process_features(in_dir="./02stocks/vit15minutes/datasets/origins",
+#                  out_dir="./02stocks/vit15minutes/datasets/features/")
\ No newline at end of file
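get_features silently returns None for short series or for any window containing a move outside [-30%, +30%); a small smoke test makes the expected output shape visible. This is a sketch on synthetic prices (all values invented), run in the same module so get_features is in scope:

    import numpy as np
    import pandas as pd

    n = 200
    rng = np.random.default_rng(0)
    close = 10 + np.cumsum(rng.normal(0, 0.05, n))       # gentle random walk
    df = pd.DataFrame({
        "date": pd.date_range("2023-01-01", periods=n).astype(str),
        "open": close, "high": close * 1.01, "low": close * 0.99,
        "close": close, "preclose": np.r_[close[0], close[:-1]],
        "volume": rng.integers(1_000, 10_000, n),
    })
    out = get_features(df)
    print(None if out is None else out.columns.tolist())
    # ['date', 'zdf', 'high', 'low', 'open', 'close', 'ma5', 'ma20', 'ma60', 'volume']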
diff --git a/m3_loaddatasets.py b/m3_loaddatasets.py
new file mode 100644
index 0000000..6a9b3a3
--- /dev/null
+++ b/m3_loaddatasets.py
@@ -0,0 +1,192 @@
+import pandas as pd
+import torch
+import os
+import math
+from torch.utils.data import Dataset
+from torch.utils.data import DataLoader
+
+max_y = 0
+min_y = 99999
+
+const_y_days = 5
+
+
+def y_feature_extract(y1):
+    y = y1["zdf"].values[0]
+    y = y - 1 + 0.3
+    y = int(y * 1000)  # zdf is filtered to [0.7, 1.3), so y lands in [0, 600) and num_classes = 600
+
+    return y
+
+
+def y_feature_extract_1_0(y1):
+    y = y1["zdf"].max()
+    if y > 1.1:
+        return 1
+    else:
+        return 0
+
+
+def feature_extract_x_7(x1):
+
+    close = x1[0:1]["close"].values[0]
+    volume = x1[0:1]["volume"].values[0]
+
+    # print(x)
+    # print(x.shape)
+    factor = 1
+    x = pd.DataFrame()
+    x["high"] = x1["high"] / close * factor
+    x["low"] = x1["low"] / close * factor
+    x["open"] = x1["open"] / close * factor
+    x["close"] = x1["close"] / close * factor
+
+    x["ma5"] = x1["ma5"] / close * factor
+    x["ma20"] = x1["ma20"] / close * factor
+    x["ma60"] = x1["ma60"] / close * factor
+
+    # x["volume"] = x1["volume"] / volume * factor
+
+    return x
+
+
+def feature_extract_x_2(x1):
+    close = x1[0:1]["close"].values[0]
+    volume = x1[0:1]["volume"].values[0]
+
+    # print(x)
+    # print(x.shape)
+    factor = 10
+    x = pd.DataFrame()
+    x["high"] = (x1["high"] - x1["close"]) / close * factor
+    x["low"] = (x1["low"] - x1["close"]) / close * factor
+    x["open"] = (x1["open"] - x1["close"]) / close * factor
+    # x["close"] = (x1["close"] - x["close"]) / close * factor
+    x["zdf"] = x1["zdf"] - 1
+
+    x["ma5"] = (x1["ma5"] - x1["close"]) / close * factor
+    x["ma20"] = (x1["ma20"] - x1["close"]) / close * factor
+    x["ma60"] = (x1["ma60"] - x1["close"]) / close * factor
+
+    x["volume"] = x1["volume"] / volume * factor
+
+    return x
+
+
+def feature_extract_x(x1):
+
+    close = x1[0:1]["close"].values[0]
+    volume = x1[0:1]["volume"].values[0]
+
+    # print(x)
+    # print(x.shape)
+    factor = 10
+    x = pd.DataFrame()
+    x["high"] = x1["high"] / close * factor
+    x["low"] = x1["low"] / close * factor
+    x["open"] = x1["open"] / close * factor
+    x["close"] = x1["close"] / close * factor
+
+    x["ma5"] = x1["ma5"] / close * factor
+    x["ma20"] = x1["ma20"] / close * factor
+    x["ma60"] = x1["ma60"] / close * factor
+
+    x["volume"] = x1["volume"] / volume * factor
+
+    return x
+
+
+def features(x1, y1):
+    y = y_feature_extract_1_0(y1)
+    # if y >= 600 or y <= 0:
+    #     print("error--------------------y > 600")
+    #     print(y)
+    #     print(y1)
+
+    x = feature_extract_x_2(x1)
+
+    return x, [y]
+
+
+class DatasetsDay(Dataset):
+    def __init__(self, df, day_length=40):
+        self.df = df
+        self.day_length = day_length
+
+    def __len__(self):
+        d = self.df.shape[0]
+        d = d - self.day_length - const_y_days
+        return 0 if d < 0 else d
+
+    def __getitem__(self, i):
+        a = i
+        e = a + self.day_length
+
+        if e >= self.df.shape[0]:
+            print("error================================================datasets_loader?")
+            return torch.ones(0, 0), torch.ones(0, 0)
+
+        x1 = self.df.iloc[a:e, :]
+        y1 = self.df.iloc[e : e + const_y_days, :]
+        # print(x)
+        # print(y)
+        ############################################
+        x, y = features(x1, y1)
+        ############################################
+        x = torch.Tensor(x.values)
+        # x = torch.unsqueeze(x, dim=0)
+        # print(x.shape)
+        # print("-----------------------x.shape")
+
+        y = torch.LongTensor(y)
+        y = torch.squeeze(y)
+
+        return x, y
+
+    def get_last_item(self):
+        s = self.df.shape[0] - self.day_length
+        e = self.df.shape[0]
+        x1 = self.df.iloc[s:e, :]
+        x = feature_extract_x(x1)
+        x = torch.Tensor(x.values)
+
+        return x
+
+
+def test(features_dir="./datasets/features1/"):
+    seq_length = 50
+    files = [a for a in sorted(os.listdir(features_dir), key=lambda x: str(x[5:]))]
+    for file in files:
+        print(file)
+        d = pd.read_csv(os.path.join(features_dir, file))
+        a = DatasetsDay(d, day_length=seq_length)
+        # print(len(a))
+
+        global max_y
+        global min_y
+        max_y = 0
+        min_y = 99999
+
+        loader = DataLoader(a, batch_size=1, shuffle=True)
+        for step, (x, y) in enumerate(loader):
+            # print(step)
+            print(x.shape)
+            print(y.shape)
+            # print("-----------------------x.shape")
+            # print(x)
+            # print(y)
+            break
+        print(max_y)
+        print(min_y)
+
+
+# files = [print(a) for a in sorted(os.listdir("./datasets/features"), key=lambda x: x[5:])]
+
+# test(features_dir="./02stocks/vit15minutes/datasets/features")
+# test(features_dir="./datasets/features")
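Each DatasetsDay item is a sliding window: x covers day_length rows and the label is derived from the const_y_days rows immediately after it, so a file with R rows yields R - day_length - const_y_days samples (e.g. R = 100 and day_length = 40 gives 55). A sketch; the file path is an assumption:

    d = pd.read_csv("./datasets/features/sz.300001.csv")
    ds = DatasetsDay(d, day_length=40)
    x, y = ds[0]                      # rows [0, 40) as input, rows [40, 45) for the label
    print(len(ds), x.shape, y.shape)  # x: (40, 8) via feature_extract_x_2, y: scalar class index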
"error================================================datasets_loader?" + ) + return torch.ones(0, 0), torch.ones(0, 0) + + x1 = self.df.iloc[a:e, :] + y1 = self.df.iloc[e : e + const_y_days, :] + # print(x) + # print(y) + ############################################ + x, y = features(x1, y1) + ############################################ + # + x = torch.Tensor(x.values) + # x = torch.unsqueeze(x, dim=0) + # print(x.shape) + # print("-----------------------x.shape") + + y = torch.LongTensor(y) + y = torch.squeeze(y) + + return x, y + + def get_last_item(self): + s = self.df.shape[0] - self.day_length + e = self.df.shape[0] + x1 = self.df.iloc[s:e, :] + x = feature_extract_x(x1) + x = torch.Tensor(x.values) + + return x + + +def test(features_dir="./datasets/features1/"): + seq_length = 50 + files = [a for a in sorted(os.listdir(features_dir), key=lambda x: str(x[5:]))] + for file in files: + print(file) + d = pd.read_csv(os.path.join(features_dir, file)) + a = DatasetsDay(d, day_length=seq_length) + # print(len(a)) + + global max_y + global min_y + max_y = 0 + min_y = 99999 + + loader = DataLoader(a, batch_size=1,shuffle=True) + for step, (x, y) in enumerate(loader): + + + + # print(step) + print(x.shape) + print(y.shape) + # print("-----------------------x.shape") + # print(x) + # print(y) + # print(x.shape) + # print(y.shape) + break + pass + print(max_y) + print(min_y) + + +# files = [print(a) for a in sorted(os.listdir("./datasets/features"), key=lambda x: x[5:])] + +# test(features_dir="./02stocks/vit15minutes/datasets/features") +# test(features_dir="./datasets/features") diff --git a/m4_embedding.py b/m4_embedding.py new file mode 100644 index 0000000..e69de29 diff --git a/m5_position.py b/m5_position.py new file mode 100644 index 0000000..ff51f75 --- /dev/null +++ b/m5_position.py @@ -0,0 +1,15 @@ + + +import json +import os +import time +import datetime +import pandas as pd +import akshare as ak + + + +def day_zt_xiao_k(df): + shape = df.shape + + diff --git a/m6_vit.py b/m6_vit.py new file mode 100644 index 0000000..70f7e0a --- /dev/null +++ b/m6_vit.py @@ -0,0 +1,133 @@ +from torch import nn +from einops.layers.torch import Rearrange +from torch import Tensor +import torch +from einops import repeat + + +class Attention(nn.Module): + def __init__(self, dim, n_heads, dropout): + super().__init__() + self.n_heads = n_heads + self.att = torch.nn.MultiheadAttention( + embed_dim=dim, num_heads=n_heads, dropout=dropout + ) + self.q = torch.nn.Linear(dim, dim) + self.k = torch.nn.Linear(dim, dim) + self.v = torch.nn.Linear(dim, dim) + + def forward(self, x): + q = self.q(x) + k = self.k(x) + v = self.v(x) + attn_output, attn_output_weights = self.att(x, x, x) + return attn_output + + +class PreNorm(nn.Module): + def __init__(self, dim, fn): + super().__init__() + self.norm = nn.LayerNorm(dim) + self.fn = fn + + def forward(self, x, **kwargs): + return self.fn(self.norm(x), **kwargs) + + +class FeedForward(nn.Sequential): + def __init__(self, dim, hidden_dim, dropout=0.0): + super().__init__( + nn.Linear(dim, hidden_dim), + nn.GELU(), + nn.Dropout(dropout), + nn.Linear(hidden_dim, dim), + nn.Dropout(dropout), + ) + + +ff = FeedForward(dim=128, hidden_dim=256) +ff(torch.ones((1, 5, 128))).shape + + +class ResidualAdd(nn.Module): + def __init__(self, fn): + super().__init__() + self.fn = fn + + def forward(self, x, **kwargs): + res = x + x = self.fn(x, **kwargs) + x += res + return x + + +class ViT3D(nn.Module): + def __init__( + self, + features_len=8, + seq_len=20, + # img_size=144, + # 
diff --git a/m6_vit_1d.py b/m6_vit_1d.py
new file mode 100644
index 0000000..f9bfab0
--- /dev/null
+++ b/m6_vit_1d.py
@@ -0,0 +1,153 @@
+import torch
+from torch import nn
+
+from einops import rearrange, repeat, pack, unpack
+from einops.layers.torch import Rearrange
+
+# classes
+
+
+class FeedForward(nn.Module):
+    def __init__(self, dim, hidden_dim, dropout=0.0):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.LayerNorm(dim),
+            nn.Linear(dim, hidden_dim),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(hidden_dim, dim),
+            nn.Dropout(dropout),
+        )
+
+    def forward(self, x):
+        return self.net(x)
+
+
+class Attention(nn.Module):
+    def __init__(self, dim, heads=8, dim_head=64, dropout=0.0):
+        super().__init__()
+        inner_dim = dim_head * heads
+        project_out = not (heads == 1 and dim_head == dim)
+
+        self.heads = heads
+        self.scale = dim_head**-0.5
+
+        self.norm = nn.LayerNorm(dim)
+        self.attend = nn.Softmax(dim=-1)
+        self.dropout = nn.Dropout(dropout)
+
+        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
+
+        self.to_out = (
+            nn.Sequential(nn.Linear(inner_dim, dim), nn.Dropout(dropout))
+            if project_out
+            else nn.Identity()
+        )
+
+    def forward(self, x):
+        x = self.norm(x)
+        qkv = self.to_qkv(x).chunk(3, dim=-1)
+        q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=self.heads), qkv)
+
+        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale
+
+        attn = self.attend(dots)
+        attn = self.dropout(attn)
+
+        out = torch.matmul(attn, v)
+        out = rearrange(out, "b h n d -> b n (h d)")
+        return self.to_out(out)
+
+
+class Transformer(nn.Module):
+    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout=0.0):
+        super().__init__()
+        self.layers = nn.ModuleList([])
+        for _ in range(depth):
+            self.layers.append(
+                nn.ModuleList(
+                    [
+                        Attention(dim, heads=heads, dim_head=dim_head, dropout=dropout),
+                        FeedForward(dim, mlp_dim, dropout=dropout),
+                    ]
+                )
+            )
+
+    def forward(self, x):
+        for attn, ff in self.layers:
+            x = attn(x) + x
+            x = ff(x) + x
+        return x
+
+
+class ViT1D(nn.Module):
+    def __init__(
+        self,
+        *,
+        seq_len,
+        patch_size,
+        num_classes,
+        dim,
+        depth,
+        heads,
+        mlp_dim,
+        channels=3,
+        dim_head=64,
+        dropout=0.0,
+        emb_dropout=0.0
+    ):
+        super().__init__()
+        assert (seq_len % patch_size) == 0
+
+        num_patches = seq_len // patch_size
+        patch_dim = channels * patch_size
+
+        self.to_patch_embedding = nn.Sequential(
+            Rearrange("b c (n p) -> b n (p c)", p=patch_size),
+            nn.LayerNorm(patch_dim),
+            nn.Linear(patch_dim, dim),
+            nn.LayerNorm(dim),
+        )
+
+        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
+        self.cls_token = nn.Parameter(torch.randn(dim))
+        self.dropout = nn.Dropout(emb_dropout)
+
+        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)
+
+        self.mlp_head = nn.Sequential(nn.LayerNorm(dim), nn.Linear(dim, num_classes))
+
+    def forward(self, series):
+        x = self.to_patch_embedding(series)
+        b, n, _ = x.shape
+
+        cls_tokens = repeat(self.cls_token, "d -> b d", b=b)
+
+        x, ps = pack([cls_tokens, x], "b * d")
+
+        x += self.pos_embedding[:, : (n + 1)]
+        x = self.dropout(x)
+
+        x = self.transformer(x)
+
+        cls_tokens, _ = unpack(x, ps, "b * d")
+
+        return self.mlp_head(cls_tokens)
+
+
+if __name__ == "__main__":
+
+    v = ViT1D(
+        seq_len=256,
+        patch_size=16,  # i.e. the number of features per patch here
+        num_classes=1000,
+        dim=1024,
+        depth=6,
+        heads=8,
+        mlp_dim=2048,
+        dropout=0.1,
+        emb_dropout=0.1,
+    )
+
+    time_series = torch.randn(4, 3, 256)
+    logits = v(time_series)  # (4, 1000)
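The Rearrange("b c (n p) -> b n (p c)") layer is the whole 1-D "patchify" step: it chops the length-256 series into 16 patches of 16 steps each and folds the channels into every patch. In isolation, with the same shapes as the demo above:

    import torch
    from einops import rearrange

    x = torch.randn(4, 3, 256)                        # (batch, channels, seq_len)
    p = rearrange(x, "b c (n p) -> b n (p c)", p=16)
    print(p.shape)                                    # torch.Size([4, 16, 48]) = (b, num_patches, patch_dim)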
diff --git a/m7_train.py b/m7_train.py
new file mode 100644
index 0000000..10e082d
--- /dev/null
+++ b/m7_train.py
@@ -0,0 +1,107 @@
+from torch.utils.data import DataLoader
+from torch.utils.data import random_split
+from torch import nn
+from einops.layers.torch import Rearrange
+from torch import Tensor
+import torch.optim as optim
+import numpy as np
+import torch
+import os
+import pandas as pd
+from m6_vit import ViT3D
+from m3_loaddatasets import DatasetsDay
+import matplotlib.pyplot as plt
+
+
+torch.set_printoptions(threshold=float("inf"))
+
+################################################################################
+seq_length = 30
+features_len = 8
+device = "cuda"
+
+model = ViT3D(
+    seq_len=seq_length,
+    features_len=features_len,
+    num_classes=2
+).to(device)
+
+optimizer = optim.AdamW(model.parameters(), lr=1e-2)
+criterion = nn.CrossEntropyLoss()
+scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
+    optimizer, factor=0.1, patience=3, verbose=True
+)
+################################################################################
+features_files = "./datasets/features"
+# features_files = "./02stocks/vitday/datasets/feature1"
+
+files = [os.path.join(features_files, a) for a in sorted(os.listdir(features_files),
+                                                         key=lambda x: (x[4:]))]
+################################################################################
+
+################################################################################
+for epoch in range(1000):
+    model.train()
+    ############################################################################
+    for file in files:
+        print(file)
+        epoch_losses = []
+        d = pd.read_csv(file)
+        a = DatasetsDay(d, day_length=seq_length)
+        loader = DataLoader(a, batch_size=128, shuffle=False)
+        for step, (x, y) in enumerate(loader):
+            x, y = x.to(device), y.to(device)
+            # print(x)
+            # print(f">>> Y: ", y)
+
+            optimizer.zero_grad()
+            outputs = model(x)
+            # print(f">>> outputs:", outputs.shape)
+            loss = criterion(outputs, y)
+            loss.backward()
+            optimizer.step()
+            epoch_losses.append(loss.item())
+            print(loss.item())
+            #################################################################
+            #################################################################
+            # print(outputs)
+            # o = torch.argmax(outputs, dim=1)
+            # if torch.equal(y, o):
+            #     print(f"succeedsssssssssssssssssssssssss->", o)
+            # else:
+            #     print(f"eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee:", y, o)
+            # p1 = outputs[0:1][:]
+            # p2 = outputs[1:1][:]
+            # p1 = p1.squeeze()
+            # p2 = p2.squeeze()
+            # print(p1)
+            # plt.plot(p1.detach().cpu().numpy())
+            # plt.plot(p2.detach().cpu().numpy())
+            # plt.show()
+            #################################################################
+            #################################################################
+            # break
+
+        mean_loss = np.mean(epoch_losses)
+        print(f">>> Epoch {epoch} train loss: ", mean_loss)
+        # epoch_losses = []
+        # # Something was strange when using this?
+        # # model.eval()
+        # for step, (inputs, labels) in enumerate(test_dataloader):
+        #     inputs, labels = inputs.to(device), labels.to(device)
+        #     outputs = model(inputs)
+        #     loss = criterion(outputs, labels)
+        #     epoch_losses.append(loss.item())
+
+        # print(f">>> Epoch {epoch} test loss: ", np.mean(epoch_losses))
+
+        # if epoch % 100 == 0:
+        #     models_save = os.path.join(
+        #         models_path, "./{}-{}-{}.pt".format(m_name, epoch, mean_loss)
+        #     )
+        scheduler.step(mean_loss)
+        torch.save(model, "./model.pt")
+
+        # break
+    # break
\ No newline at end of file
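The commented-out test loop notes that "something was strange when using model.eval()"; the usual recipe is to switch modes and disable autograd together, so dropout is off and no graph is built, then switch back before the next training pass. A sketch for that block, assuming a test_dataloader built the same way as the train loader (it is only named in the comments above, not defined):

    model.eval()
    test_losses = []
    with torch.no_grad():                       # no autograd graph during evaluation
        for inputs, labels in test_dataloader:  # hypothetical held-out loader
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            test_losses.append(criterion(outputs, labels).item())
    print(f">>> Epoch {epoch} test loss: ", np.mean(test_losses))
    model.train()                               # back to training mode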
diff --git a/m8_evaluate.py b/m8_evaluate.py
new file mode 100644
index 0000000..00566c5
--- /dev/null
+++ b/m8_evaluate.py
@@ -0,0 +1,40 @@
+import torch
+import os
+from torch.utils.data import DataLoader
+import pandas as pd
+from m3_loaddatasets import DatasetsDay
+import numpy as np
+from m6_vit import ViT3D  # the class is ViT3D; it must be importable for torch.load to unpickle the model
+
+
+np.set_printoptions(threshold=10000)
+################################################################################
+seq_length = 50
+device = "cuda"
+model = torch.load("./model1.pt").to(device)
+################################################################################
+features_files = "./datasets/feature1"
+# features_files = "./02stocks/vitday/datasets/feature1"
+files = [os.path.join(features_files, a) for a in sorted(os.listdir(features_files),
+                                                         key=lambda x: (x[4:]))]
+################################################################################
+model.eval()
+################################################################################
+for epoch in range(1000):
+    ############################################################################
+    for file in files:
+        print(file)
+        d = pd.read_csv(file)
+        a = DatasetsDay(d, day_length=seq_length)
+        loader = DataLoader(a, batch_size=1, shuffle=True)
+        for step, (x, y) in enumerate(loader):
+            x, y = x.to(device), y.to(device)
+            print(f">>> Y: ", y)
+            outputs = model(x)
+            print(f">>> outputs:", outputs)
+            print(torch.argmax(outputs, dim=1))
+            break
+        break
+    break
+
+
diff --git a/m9_infer.py b/m9_infer.py
new file mode 100644
index 0000000..a6668a6
--- /dev/null
+++ b/m9_infer.py
@@ -0,0 +1,39 @@
+import torch
+from torch.utils.data import DataLoader
+from torch.utils.data import random_split
+from torch import nn
+from einops.layers.torch import Rearrange
+from torch import Tensor
+import torch.optim as optim
+import numpy as np
+import os
+import pandas as pd
+from m3_loaddatasets import DatasetsDay
+from m6_vit import ViT3D
+
+#############################################################################
+seq_length = 50
+features_len = 8
+device = "cuda"
+m = torch.load("./model1.pt").to(device)
+#############################################################################
+features_files = "./datasets/features1"
+files = [os.path.join(features_files, a) for a in sorted(os.listdir(features_files),
+                                                         key=lambda x: (x[4:]))]
+
+zdfs = []
+for file in files:
+    print(file)
+
+    d = pd.read_csv(file)
+    a = DatasetsDay(d, day_length=seq_length)
+    x = a.get_last_item()          # instance method, not DatasetsDay.get_last_item()
+    x = x.unsqueeze(0).to(device)  # the model expects (batch, seq_len, features)
+    outputs = m(x)
+    cls = outputs.argmax()
+    zdf = cls / 1000 - 0.3 + 1     # inverse of the y encoding in m3_loaddatasets
+    print(zdf)
+    zdfs.append(zdf)
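torch.load on a whole pickled model only works while the ViT3D class stays importable under the same module path it was saved from; the more portable pattern is saving and loading the state_dict. A sketch under that assumption (the ./model_state.pt filename is hypothetical):

    # saving side (in m7_train.py):
    torch.save(model.state_dict(), "./model_state.pt")

    # loading side (here):
    m = ViT3D(seq_len=seq_length, features_len=features_len, num_classes=2)
    m.load_state_dict(torch.load("./model_state.pt", map_location=device))
    m = m.to(device).eval()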
diff --git a/t1_create_sample.py b/t1_create_sample.py
new file mode 100644
index 0000000..2946758
--- /dev/null
+++ b/t1_create_sample.py
@@ -0,0 +1,33 @@
+import os
+import pandas as pd
+
+
+def create_1():
+    features_dir = "./datasets/features"
+    files = [a for a in sorted(os.listdir(features_dir), key=lambda x: str(x[5:]))]
+    for file in files:
+        print(file)
+        d = pd.read_csv(os.path.join(features_dir, file))
+
+        # first 51 rows of each features file
+        a = d.iloc[0:51][:]
+
+        p = os.path.join(features_dir, "test" + file)
+        a.to_csv(p)
+
+
+# create_1()
+
+
+def create_2():
+    code = "sz.300001.csv"
+    file = "./datasets/features/" + code
+    d = pd.read_csv(file)
+
+    # first 52 rows of a single file
+    a = d.iloc[0:52][:]
+
+    p = os.path.join("./datasets/feature1", code)
+    a.to_csv(p)
+
+
+create_2()
\ No newline at end of file
diff --git a/z-predict.py b/z-predict.py
new file mode 100644
index 0000000..e69de29
diff --git a/zutils.py b/zutils.py
new file mode 100644
index 0000000..37cfded
--- /dev/null
+++ b/zutils.py
@@ -0,0 +1,179 @@
+import pandas as pd
+from bson.json_util import dumps
+from io import StringIO
+import json
+from datetime import datetime
+import time
+
+
+def download_to_column(df):
+    ##################################################################
+    # df = stock_hfq_df
+    # aa = df.loc[:, "收盘"]
+    # df["前收盘"] = aa.shift()
+    # df.at[0, "前收盘"] = df.at[0, "收盘"]
+    #
+    # tt = df.loc[:, "换手率"]
+    # df["前换手率"] = tt.shift()
+    # df.at[0, "前换手率"] = df.at[0, "换手率"]
+
+    # df["换手率"] = df["换手率"].astype(float)
+    # df["前换手率"] = df["换手率"].pct_change()
+    # df["收盘"] = df["收盘"].astype(float)
+    # df["前收盘"] = df["收盘"].pct_change()
+    ##################################################################
+    stock = pd.DataFrame()
+    # stock["d"] = time.strftime("%Y-%m-%d", time.localtime(df.loc[:, "日期"]))
+    stock["dateT"] = df.loc[:, "日期"].astype(str)
+    stock["open"] = df.loc[:, "开盘"].astype(float)
+    stock["close"] = df.loc[:, "收盘"].astype(float)
+    # stock["pre_close"] = df.loc[:, "前收盘"]
+    stock["high"] = df.loc[:, "最高"].astype(float)
+    stock["low"] = df.loc[:, "最低"].astype(float)
+    stock["turnover"] = df.loc[:, "换手率"].astype(float)
+    # stock['pre_turnover'] = df.loc[:, "前换手率"]
+    stock["zf"] = df.loc[:, "振幅"].astype(float)
+    stock["zdf"] = df.loc[:, "涨跌幅"].astype(float)
+    # # kdj9(stock)
+    # # kdj45(stock)
+    # # ma(stock)
+    # data = stock.replace(np.nan, 0)
+    # data = data.replace(np.inf, 0)
+    # data = data.fillna(0)
+
+    return stock
+
+
+def jsonl_2_dp(jsonl):
+    json_data = dumps(jsonl, indent=2)
+    pd_stocks = pd.read_json(StringIO(json_data))
+    return pd_stocks
+
+
+def dp_2_jsonl(dataset):
+    docs = dataset.to_json(orient="records")
+    docs = json.loads(docs)
+    # collection.insert_many(docs)
+    return docs
+
+
+def timestamp2time(timestamp=1707769336):
+    # as in 0utils.py: datetime here is the class, not the module
+    dateArray = datetime.fromtimestamp(timestamp)
+    # otherStyleTime = dateArray.strftime("%Y-%m-%d %H:%M:%S")
+    otherStyleTime = dateArray.strftime("%Y-%m-%d")
+    print(otherStyleTime)
+    return otherStyleTime
+
+
+def time2timestamp(t="2024-02-13 04:22:16"):
+    # time given as a string
+    # parse it into a time struct
+    timeArray = time.strptime(t, "%Y-%m-%d %H:%M:%S")
+    print(timeArray)
+    # timeArray exposes tm_year and friends
+    # convert the struct to an epoch timestamp
+    timeStamp = int(time.mktime(timeArray))
+    print(timeStamp)
+    return timeStamp
+
+
+######################################################
+######################################################
+######################################################
+def data_clean(df):
+    # placeholder, not implemented yet
+    return
+
+######################################################
+######################################################
+def trend_high_price(df):
+    # Trace back the previous 3 highs; if there is only one,
+    # the other two are taken to equal the first.
+    # df["high"] = df["high"].astype(float)
+    ph = []
+    pl = []
+    # Collect per-row snapshots and assign whole columns at the end;
+    # chained df.iloc[i]["ph"] = ... assignment would be a silent no-op.
+    ph_col = []
+    pl_col = []
+    for i in range(len(df)):
+        h = df.iloc[i]["high"]
+        l = df.iloc[i]["low"]
+        if len(ph) < 1:
+            ph.append(h)
+            pl.append(l)
+            ph_col.append(h)
+            pl_col.append(l)
+            continue
+
+        ma20 = df.iloc[i]["ma20"]
+        ma60 = df.iloc[i]["ma60"]
+        if ma20 > ma60:
+            r = all(v > h for v in ph)
+            if not r:
+                pre_v = -1
+                for v in reversed(ph):
+                    if v > h:
+                        pre_v = v
+                if pre_v > 0:
+                    ph.append(pre_v)
+                else:
+                    ph.append(h)
+        else:
+            # mirror of the branch above, against the lows
+            r = all(v < l for v in pl)
+            if not r:
+                pre_v = -1
+                for v in reversed(pl):
+                    if v < l:
+                        pre_v = v
+                if pre_v > 0:
+                    pl.append(pre_v)
+                else:
+                    pl.append(l)
+
+        ph_col.append(list(ph[:-1]))
+        pl_col.append(list(pl[:-1]))
+
+    df["ph"] = ph_col
+    df["pl"] = pl_col
+
+    return df
+
+
+def trend_high_price2(df):
+    # Same tracing as above, but the trend test is close vs. ma20.
+    ph = []
+    pl = []
+    ph_col = []
+    pl_col = []
+    for i in range(len(df)):
+        h = df.iloc[i]["high"]
+        l = df.iloc[i]["low"]
+        if len(ph) < 1:
+            ph.append(h)
+            pl.append(l)
+            ph_col.append(h)
+            pl_col.append(l)
+            continue
+        c = df.iloc[i]["close"]
+        ma20 = df.iloc[i]["ma20"]
+        ma60 = df.iloc[i]["ma60"]
+        if c > ma20:
+            r = all(v > h for v in ph)
+            if not r:
+                pre_v = -1
+                for v in reversed(ph):
+                    if v > h:
+                        pre_v = v
+                if pre_v > 0:
+                    ph.append(pre_v)
+                else:
+                    ph.append(h)
+        else:
+            r = all(v < l for v in pl)
+            if not r:
+                pre_v = -1
+                for v in reversed(pl):
+                    if v < l:
+                        pre_v = v
+                if pre_v > 0:
+                    pl.append(pre_v)
+                else:
+                    pl.append(l)
+
+        ph_col.append(list(ph[:-1]))
+        pl_col.append(list(pl[:-1]))
+
+    df["ph"] = ph_col
+    df["pl"] = pl_col
+
+    return df
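With the column assignment done in one pass at the end, trend_high_price can be sanity-checked on a handful of hand-made rows. A sketch with invented values (ma20/ma60 would normally come from ma() in m2_features.py):

    import pandas as pd

    df = pd.DataFrame({
        "high": [10.0, 11.0, 10.5, 12.0],
        "low":  [9.0, 9.5, 9.8, 10.2],
        "ma20": [9.6, 10.0, 10.2, 10.6],
        "ma60": [9.4, 9.6, 9.8, 10.0],
    })
    out = trend_high_price(df)
    print(out[["high", "low", "ph", "pl"]])
    # row 0 records the seed high/low; later rows record the snapshot lists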