feat(gpt): add sample autoencoder anomaly detection model, and some internal refactor. (#30227)

* fix(stream): support packaging enterprise edition. * feat(gpt): support lstm and do some internal refactor, add sample autoencoder model. * feat(gpt): support lstm and do some internal refactor, add sample autoencoder model. * test(gpt): disable model case. * test(gpt): disable model case. * doc: fix title error in doc.
2025-03-18 13:34:17 +08:00 · 2025-03-18 13:34:17 +08:00 · 8f89aec7cb
parent 394f64a5a5
commit 8f89aec7cb
9 changed files with 127 additions and 36 deletions
--- a/docs/zh/06-advanced/06-TDgpt/04-forecast/04-lstm.md
+++ b/docs/zh/06-advanced/06-TDgpt/04-forecast/04-lstm.md
@ -0,0 +1,31 @@
 ---
 title: "LSTM"
 sidebar_label: "LSTM"
 ---
 本节说明 LSTM 模型的使用方法。
 ## 功能概述
 LSTM 模型即长短期记忆网络（Long Short-Term Memory），是一种特殊的循环神经网络，适用于时间序列数据处理、自然语言处理等任务。它通过独特的门控机制有效捕捉长期依赖关系，
 解决了传统 RNN 的梯度消失问题，从而能够对序列数据进行准确预测，但它不直接提供置信区间的计算结果。
 完整的调用SQL语句如下：
 ```SQL
 SELECT _frowts, FORECAST(i32, "algo=lstm,alpha=95,period=10,start_p=1,max_p=5,start_q=1,max_q=5") from foo
 ```
 ```json5
 {
 "rows": fc_rows,  // 返回结果的行数
 "period": period, // 返回结果的周期性，同输入
 "alpha": alpha,   // 返回结果的置信区间，同输入
 "algo": "lstm",  // 返回结果使用的算法
 "mse": mse,       // 拟合输入时间序列时候生成模型的最小均方误差(MSE)
 "res": res        // 列模式的结果
 }
 ```
 ### 参考文献
 - [1] Hochreiter S. Long Short-term Memory[J]. Neural Computation MIT-Press, 1997.
--- a/docs/zh/06-advanced/06-TDgpt/05-anomaly-detection/04-machine-learning.md
+++ b/docs/zh/06-advanced/06-TDgpt/05-anomaly-detection/04-machine-learning.md
@ -3,7 +3,9 @@ title: "机器学习算法"
 sidebar_label: "机器学习算法"
 ---
-Autoencoder<sup>[1]</sup>: TDgpt 内置使用自编码器（Autoencoder）的异常检测算法，对周期性的时间序列数据具有较好的检测结果。使用该模型需要针对输入时序数据进行预训练，同时将训练完成的模型保存在到服务目录 `ad_autoencoder` 中，然后在 SQL 语句中指定调用该算法模型即可使用。
+Autoencoder<sup>[1]</sup>: TDgpt 内置使用自编码器（Autoencoder）的异常检测算法，
 对周期性的时间序列数据具有较好的检测结果。使用该模型需要针对输入时序数据进行预训练，
 同时将训练完成的模型保存到服务目录 `ad_autoencoder` 中，然后在 SQL 语句中指定调用该算法模型即可使用。
 ```SQL
 --- 在 options 中增加 model 的名称，ad_autoencoder_foo， 针对 foo 数据集（表）训练的采用自编码器的异常检测模型进行异常检测
--- a/tools/tdgpt/model/sample-ad-autoencoder/sample-ad-autoencoder.info
+++ b/tools/tdgpt/model/sample-ad-autoencoder/sample-ad-autoencoder.info
--- a/tools/tdgpt/model/sample-ad-autoencoder/sample-ad-autoencoder.keras
+++ b/tools/tdgpt/model/sample-ad-autoencoder/sample-ad-autoencoder.keras
--- a/tools/tdgpt/taosanalytics/algo/ad/autoencoder.py
+++ b/tools/tdgpt/taosanalytics/algo/ad/autoencoder.py
@ -4,6 +4,7 @@
 import os.path
 import joblib
 import keras
 import numpy as np
 import pandas as pd
@ -13,8 +14,8 @@ from taosanalytics.util import create_sequences
 class _AutoEncoderDetectionService(AbstractAnomalyDetectionService):
-    name = 'ad_encoder'
+    name = 'sample_ad_model'
-    desc = "anomaly detection based on auto encoder"
+    desc = "sample anomaly detection model based on auto encoder"
    def __init__(self):
        super().__init__()
@ -25,7 +26,7 @@ class _AutoEncoderDetectionService(AbstractAnomalyDetectionService):
        self.threshold = None
        self.time_interval = None
        self.model = None
-        self.dir = 'ad_autoencoder'
+        self.dir = 'sample-ad-autoencoder'
        self.root_path = conf.get_model_directory()
@ -61,11 +62,6 @@ class _AutoEncoderDetectionService(AbstractAnomalyDetectionService):
        # Detect all the samples which are anomalies.
        anomalies = mae > self.threshold
        # syslogger.log_inst(
        #     "Number of anomaly samples: %f, Indices of anomaly samples:{}".
        #     format(np.sum(anomalies), np.where(anomalies))
        # )
        # data i is an anomaly if samples [(i - timesteps + 1) to (i)] are anomalies
        ad_indices = []
        for data_idx in range(self.time_interval - 1,
@ -82,13 +78,13 @@ class _AutoEncoderDetectionService(AbstractAnomalyDetectionService):
        name = params['model']
-        module_file_path = f'{self.root_path}/{name}.dat'
+        module_file_path = f'{self.root_path}/{name}.keras'
        module_info_path = f'{self.root_path}/{name}.info'
        app_logger.log_inst.info("try to load module:%s", module_file_path)
        if os.path.exists(module_file_path):
-            self.model = joblib.load(module_file_path)
+            self.model = keras.models.load_model(module_file_path)
        else:
            app_logger.log_inst.error("failed to load autoencoder model file: %s", module_file_path)
            raise FileNotFoundError(f"{module_file_path} not found")
--- a/tools/tdgpt/taosanalytics/algo/fc/arima.py
+++ b/tools/tdgpt/taosanalytics/algo/fc/arima.py
@ -68,24 +68,6 @@ class _ArimaService(AbstractForecastService):
        fc = model.predict(n_periods=fc_rows, return_conf_int=self.return_conf,
                           alpha=self.conf)
        # plt.plot(source_data, label='training')
        # plt.plot(xrange, actual_data, label='actual')
        # fc_list = fc.tolist()
        # fc_without_diff = restore_from_diff(self.list, fc_list, 2)
        # print(fc_without_diff)
        # plt.plot(xrange, fc_without_diff, label='fc')
        # residuals = pd.DataFrame(model.arima_res_.resid)
        # wn = is_white_noise(residuals)
        # print("residual is white noise:", wn)
        # fig, ax = plt.subplots(1, 2)
        # residuals.plot(title="Residuals", ax=ax[0])
        # residuals.plot(kind='kde', title='Density', ax=ax[1])
        # plt.show()
        res1 = [fc[0].tolist(), fc[1][:, 0].tolist(),
                fc[1][:, 1].tolist()] if self.return_conf else [fc.tolist()]
--- a/tools/tdgpt/taosanalytics/algo/fc/lstm.py
+++ b/tools/tdgpt/taosanalytics/algo/fc/lstm.py
@ -0,0 +1,81 @@
 # encoding:utf-8
 # pylint: disable=c0103
 """ sample forecast service for time series data based on a pre-trained LSTM model"""
 import os.path
 import keras
 from taosanalytics.algo.forecast import insert_ts_list
 from taosanalytics.conf import app_logger, conf
 from taosanalytics.service import AbstractForecastService
 class _LSTMService(AbstractForecastService):
    """Sample forecast service backed by a pre-trained Keras LSTM model.

    The model file is looked up as ``<model directory>/sample-fc-lstm/<model>.keras``
    where ``<model>`` is the value of the ``model`` option given to
    :meth:`set_params`.
    """
    name = 'sample_forecast_model'
    desc = "sample forecast model based on LSTM"

    def __init__(self):
        """Resolve the model directory and log whether it exists."""
        super().__init__()

        # These attributes are initialised but never read inside this class;
        # they are kept so the instance layout stays unchanged for callers.
        self.table_name = None
        self.mean = None
        self.std = None
        self.threshold = None
        self.time_interval = None

        # Populated by set_params(); execute() refuses to run while it is None.
        self.model = None

        self.dir = 'sample-fc-lstm'
        self.root_path = conf.get_model_directory()
        self.root_path = self.root_path + f'/{self.dir}/'

        # A missing directory is only logged here; the failure surfaces later
        # in set_params() when the model file cannot be found.
        if not os.path.exists(self.root_path):
            app_logger.log_inst.error(
                "%s forecast algorithm failed to locate default module directory:"
                "%s, not active", self.__class__.__name__, self.root_path)
        else:
            app_logger.log_inst.info("%s forecast algorithm root path is: %s",
                                     self.__class__.__name__, self.root_path)

    def execute(self):
        """Run the loaded model on the input data and return the forecast.

        Returns:
            An empty list when the input is empty; otherwise a dict with
            ``"mse"`` (always ``None``, no fit error is computed here) and
            ``"res"``, a list of result columns.

        Raises:
            FileNotFoundError: if no model has been loaded by set_params().
        """
        if self.input_is_empty():
            return []

        if self.model is None:
            raise FileNotFoundError("not load LSTM model yet, or load model failed")

        res = self.model.predict(self.list)

        # The model produces no confidence interval, so when one is requested
        # the point forecast is reused for the lower and upper bound columns.
        if self.return_conf:
            res1 = [res.tolist(), res.tolist(), res.tolist()]
        else:
            res1 = [res.tolist()]

        # Apply the timestamp helper to the column list that is actually
        # returned, not to the raw prediction object.
        # NOTE(review): assumes insert_ts_list adds the timestamp column to
        # the given list in place (its return value was already ignored
        # before this change) - confirm against taosanalytics.algo.forecast.
        insert_ts_list(res1, self.start_ts, self.time_step, self.fc_rows)

        return {
            "mse": None,
            "res": res1
        }

    def set_params(self, params):
        """Load the Keras model named by ``params['model']``.

        Args:
            params: option mapping; must contain the key ``"model"``.

        Raises:
            ValueError: if ``"model"`` is missing from ``params``.
            FileNotFoundError: if ``<root_path>/<model>.keras`` does not exist.
        """
        # NOTE(review): no base-class set_params is invoked here; if the
        # parent class parses the generic forecast options in its own
        # set_params, confirm they are handled elsewhere for this service.
        if "model" not in params:
            raise ValueError("model needs to be specified")

        name = params['model']

        # root_path already ends with a separator; join avoids a doubled '/'.
        module_file_path = os.path.join(self.root_path, f'{name}.keras')

        app_logger.log_inst.info("try to load module:%s", module_file_path)

        if os.path.exists(module_file_path):
            self.model = keras.models.load_model(module_file_path)
        else:
            app_logger.log_inst.error("failed to load LSTM model file: %s", module_file_path)
            raise FileNotFoundError(f"{module_file_path} not found")

    def get_params(self):
        """Return the service parameters: a glob over the model directory."""
        return {"dir": self.dir + '/*'}
--- a/tools/tdgpt/taosanalytics/test/anomaly_test.py
+++ b/tools/tdgpt/taosanalytics/test/anomaly_test.py
@ -142,14 +142,13 @@ class AnomalyDetectionTest(unittest.TestCase):
    def test_autoencoder_ad(self):
        """for local test only, disabled it in github action"""
        pass 
        # data = self.__load_remote_data_for_ad()
        #
-        # s = loader.get_service("ad_encoder")
+        # s = loader.get_service("sample_ad_model")
        # s.set_input_list(data)
        #
        # try:
-        #     s.set_params({"model": "ad_encoder_"})
+        #     s.set_params({"model": "sample-ad-autoencoder"})
        # except ValueError as e:
        #     app_logger.log_inst.error(f"failed to set the param for auto_encoder algorithm, reason:{e}")
        #     return
@ -157,9 +156,9 @@ class AnomalyDetectionTest(unittest.TestCase):
        # r = s.execute()
        #
        # num_of_error = -(sum(filter(lambda x: x == -1, r)))
        # self.assertEqual(num_of_error, 109)
        #
        # draw_ad_results(data, r, "autoencoder")
        #
        # self.assertEqual(num_of_error, 109)
    def test_get_all_services(self):
        """Test get all services"""
--- a/tools/tdgpt/taosanalytics/test/unit_test.py
+++ b/tools/tdgpt/taosanalytics/test/unit_test.py
@ -99,7 +99,7 @@ class ServiceTest(unittest.TestCase):
            if item["type"] == "anomaly-detection":
                self.assertEqual(len(item["algo"]), 6)
            else:
-                self.assertEqual(len(item["algo"]), 2)
+                self.assertEqual(len(item["algo"]), 3)
 if __name__ == '__main__':