
Commit bea5115

llama: split the llama module (#38)

* py: remove inplace operations and leave them to the compiler
* py: verify functional's mean, swish, sigmoid, softmax
* indexselect: fix the CUDA indexselect bug; verified nn.embedding
* to: precision (dtype) conversion operator
* llama: split the llama module
* leaffunc: add dropout
1 parent f9fb34d commit bea5115

File tree

73 files changed (+2099 −2254 lines)


doc/excuter/op-mem-cuda/list.md (+74 −73): large diff, not rendered by default.

doc/excuter/op-mem-ompsimd/list.md (+72 −71): large diff, not rendered by default.

excuter/cpp-common/src/deepx/shape.hpp (+1 −1)

```diff
@@ -66,7 +66,7 @@ namespace deepx
 
        // rangeParallel supports OMP; no thread-local variables are needed inside the OMP body
        void rangeParallel(int dimCount, std::function<void(const std::vector<int> &indices)> func) const;
-       void rangeParallel(int dimCount, std::function<void(const int idx_linear)> func) const;
+       void rangeElementwiseParallel( std::function<void(const int idx_linear,const int idx_linear_end)> func) const;
        void rangeParallel(int dimCount, std::function<void(const int idx_linear, const std::vector<int> &indices)> func) const;
 
        // supports OMP, but thread-local variables are needed inside the OMP body
```
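In short: the per-index overload `rangeParallel(int dimCount, func(idx_linear))` is replaced by `rangeElementwiseParallel(func(idx_linear, idx_linear_end))`, so the callback now receives a half-open chunk `[idx_linear, idx_linear_end)` of linear indices instead of being invoked once per index. The implementation lands in shape_range.cpp below.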

excuter/cpp-common/src/deepx/shape_range.cpp (+18 −11)

```diff
@@ -2,6 +2,7 @@
 #include <vector>
 #include <functional>
 #include <any>
+#include <thread>
 
 #include <omp.h>
 #include "deepx/shape.hpp"
@@ -113,18 +114,24 @@ namespace deepx
             }
         }
     }
-    void Shape::rangeParallel(int dimCount, std::function<void(const int idx_linear)> func) const
-    {
-        dimCount = checkdim(dimCount, dim());
-        int stride = checkStride(dimCount, shape);
-
-        // compute the total number of iterations
-        int total = size / stride;
-
-#pragma omp parallel for
-        for (int idx = 0; idx < total; idx++)
+    void Shape::rangeElementwiseParallel(std::function<void(const int idx_linear,const int idx_linear_end)> func) const
+    {
+        int num_threads = std::thread::hardware_concurrency();
+        int alignblock=size/num_threads;
+        const int minblock=256;
+        if (alignblock<minblock)
         {
-            func(idx * stride);
+            alignblock=minblock;
+            num_threads=size/alignblock;
+        }
+#pragma omp parallel for num_threads(num_threads)
+        for (int idx = 0; idx < size; idx+=alignblock)
+        {
+            int end = idx + alignblock;
+            if (end > size) {
+                end = size;
+            }
+            func(idx,end);
         }
     }
 
```
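The chunking aims for one contiguous block per hardware thread but never hands out blocks smaller than `minblock = 256` elements, which amortizes OMP scheduling overhead for cheap elementwise kernels. (Note that when `size < 256`, `num_threads` evaluates to `size / 256 == 0`; OpenMP expects a positive value in the `num_threads` clause, so a `std::max(1, ...)` clamp would be safer.) A minimal usage sketch — the buffers `a`, `b`, `c` and the `shape` instance are hypothetical, not part of this commit:

```cpp
// Hypothetical elementwise add over raw float buffers of shape.size elements.
shape.rangeElementwiseParallel([&](const int begin, const int end) {
    // Each callback invocation owns the half-open chunk [begin, end),
    // so the loop body needs no thread-local state.
    for (int i = begin; i < end; ++i)
        c[i] = a[i] + b[i];
});
```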

excuter/cpp-common/src/deepx/tensor.hpp (+1 −0)

```diff
@@ -26,6 +26,7 @@ namespace deepx
         DeleteFn deleter; // frees the memory
 
         using CopyFn = void (*)(T *, T *, int);
+        //copyer(src, dest, size)
         CopyFn copyer; // copies the memory
 
         using SaveFn = void (*)(T *,size_t,const std::string &);
```

excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp (+4 −0)

```diff
@@ -6,6 +6,10 @@
 
 namespace deepx::tensorfunc
 {
+    //todtype
+    template <typename T,typename Dtype>
+    void todtype(const Tensor<T> &input, Tensor<Dtype> &output);
+
     template <typename Author, typename T>
     struct addDispatcher
     {
```
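Only the declaration lands in this header. For illustration, a straightforward definition — a sketch assuming `Tensor<T>` exposes `shape` and a flat `data` pointer, not necessarily the commit's actual implementation — would cast element by element:

```cpp
// Illustrative sketch: elementwise precision cast from T to Dtype.
template <typename T, typename Dtype>
void todtype(const Tensor<T> &input, Tensor<Dtype> &output)
{
    // Walk the flat index space in chunks (see rangeElementwiseParallel
    // above); static_cast performs the per-element conversion.
    input.shape.rangeElementwiseParallel([&](const int begin, const int end) {
        for (int i = begin; i < end; ++i)
            output.data[i] = static_cast<Dtype>(input.data[i]);
    });
}
```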

excuter/cpp-common/src/deepx/tf/tffactory.cpp (+3 −4)

```diff
@@ -103,15 +103,14 @@ namespace deepx::tf
         // generate one table per tftype
         for (const auto &[tftype, tfs] : tf_by_type) {
             ss << "### " << tftype << "\n\n";
-            ss << "| Operation | Author | Func Def | Math Formula | IR Instruction |\n";
-            ss << "|-----------|--------|------------|--------------|----------------|\n";
+            ss << "| Operation | Author | Math Formula | IR Instruction |\n";
+            ss << "|-----------|--------|--------------|----------------|\n";
 
             for (const auto &tf : tfs) {
                 ss << "| " << tf->name << " | ";
                 ss << (tf->metadata.author.empty() ? " none " : tf->metadata.author) << " | ";
-                ss << tf->to_string(false, true) << " | ";
                 ss << tf->math_formula() << " | ";
-                ss << tf->to_string(false, true) << " |\n";
+                ss << stdutil::escape_markdown(tf->to_string(false, true)) << " |\n";
             }
 
             ss << "\n";
```
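Two fixes in one: the Func Def column, which printed the same `tf->to_string(false, true)` text as the IR Instruction column, is dropped, and the remaining IR string is routed through the new `stdutil::escape_markdown` so that embedded newlines, quotes, and backslashes cannot break the generated table row.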

excuter/cpp-common/src/stdutil/string.cpp (+41 −1)

```diff
@@ -8,9 +8,49 @@ namespace stdutil
         str.erase(str.find_last_not_of(" ") + 1);
     }
 
-    void trim(string &str,const string &chars)
+    void trim(string &str, const string &chars)
     {
         str.erase(0, str.find_first_not_of(chars));
         str.erase(str.find_last_not_of(chars) + 1);
     }
+
+    string escape_markdown(const string &str)
+    {
+        std::string result;
+        for (char c : str)
+        {
+            switch (c)
+            {
+            case '\\':
+                result += "\\\\";
+                break;
+            case '\"':
+                result += "\\\"";
+                break;
+            case '\'':
+                result += "\\\'";
+                break;
+            case '\n':
+                result += "\\n";
+                break;
+            case '\t':
+                result += "\\t";
+                break;
+            case '\r':
+                result += "\\r";
+                break;
+            case '\b':
+                result += "\\b";
+                break;
+            case '\f':
+                result += "\\f";
+                break;
+            default:
+                // ordinary characters are appended as-is
+                result += c;
+            }
+        }
+        return result;
+    }
+
 } // namespace stdutil
```
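A quick check of the new function (hypothetical input): each special character maps to its two-character escape, and everything else passes through unchanged.

```cpp
#include <iostream>
#include "stdutil/string.hpp"

int main()
{
    // The embedded newline and quotes come back as literal \n and \",
    // so the result stays on a single markdown table row.
    std::string cell = "line1\nline2 \"quoted\"";
    std::cout << stdutil::escape_markdown(cell) << std::endl;
    // prints: line1\nline2 \"quoted\"
}
```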

excuter/cpp-common/src/stdutil/string.hpp (+1 −0)

```diff
@@ -10,6 +10,7 @@ namespace stdutil
     void trimspace(string &str);
     void trim(string &str,const string &chars=" \t\n\r\f\v");
 
+    string escape_markdown(const string &str);
 }
```

excuter/op-mem-cuda/src/client/tfs.cpp (+12 −0)

```diff
@@ -164,6 +164,18 @@ namespace deepx::tf
     // elementwise
     void register_elementwise(TfFactory &tffactory)
     {
+        //todtype
+        tffactory.add_tf(std::make_shared<Todtype>(vector<Param>(
+                                                       {
+                                                           Param("a", DataCategory::Tensor, Precision::Any),
+                                                       }),
+                                                   vector<Param>(
+                                                       {
+                                                           Param("b", DataCategory::Tensor, Precision::Any),
+                                                       })));
+
+
+        // add
         tffactory.add_tf(std::make_shared<Add<miaobyte>>(vector<Param>(
             {
                 Param("a", DataCategory::Tensor, Precision::Any),
```
