
Commit bea5115

llama: split the llama module (#38)

* py: remove inplace operations and leave them to the compiler
* py: verify functional's mean, swish, sigmoid, softmax
* indexselect: fix the CUDA indexselect bug; verified nn.embedding
* to: precision (dtype) conversion operator
* llama: split the llama module
* leaffunc: add dropout
1 parent f9fb34d commit bea5115

File tree

73 files changed (+2099 −2254 lines)


doc/excuter/op-mem-cuda/list.md (+74 −73): large diff, not rendered by default.

doc/excuter/op-mem-ompsimd/list.md (+72 −71): large diff, not rendered by default.

excuter/cpp-common/src/deepx/shape.hpp (+1 −1)

```diff
@@ -66,7 +66,7 @@ namespace deepx
 
        // rangeParallel supports OMP; no thread-local variables are needed inside the OMP body
        void rangeParallel(int dimCount, std::function<void(const std::vector<int> &indices)> func) const;
-       void rangeParallel(int dimCount, std::function<void(const int idx_linear)> func) const;
+       void rangeElementwiseParallel( std::function<void(const int idx_linear,const int idx_linear_end)> func) const;
        void rangeParallel(int dimCount, std::function<void(const int idx_linear, const std::vector<int> &indices)> func) const;
 
        // supports OMP, but thread-local variables are needed inside the OMP body
```
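In short: the per-index overload `rangeParallel(int dimCount, func(idx_linear))` is replaced by `rangeElementwiseParallel(func(idx_linear, idx_linear_end))`, so the callback now receives a half-open chunk `[idx_linear, idx_linear_end)` of linear indices instead of being invoked once per index. The implementation lands in shape_range.cpp below.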

excuter/cpp-common/src/deepx/shape_range.cpp (+18 −11)

```diff
@@ -2,6 +2,7 @@
 #include <vector>
 #include <functional>
 #include <any>
+#include <thread>
 
 #include <omp.h>
 #include "deepx/shape.hpp"
@@ -113,18 +114,24 @@ namespace deepx
             }
         }
     }
-    void Shape::rangeParallel(int dimCount, std::function<void(const int idx_linear)> func) const
-    {
-        dimCount = checkdim(dimCount, dim());
-        int stride = checkStride(dimCount, shape);
-
-        // compute the total number of iterations
-        int total = size / stride;
-
-#pragma omp parallel for
-        for (int idx = 0; idx < total; idx++)
+    void Shape::rangeElementwiseParallel(std::function<void(const int idx_linear,const int idx_linear_end)> func) const
+    {
+        int num_threads = std::thread::hardware_concurrency();
+        int alignblock=size/num_threads;
+        const int minblock=256;
+        if (alignblock<minblock)
         {
-            func(idx * stride);
+            alignblock=minblock;
+            num_threads=size/alignblock;
+        }
+#pragma omp parallel for num_threads(num_threads)
+        for (int idx = 0; idx < size; idx+=alignblock)
+        {
+            int end = idx + alignblock;
+            if (end > size) {
+                end = size;
+            }
+            func(idx,end);
         }
     }
 
```
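The chunking aims for one contiguous block per hardware thread but never hands out blocks smaller than `minblock = 256` elements, which amortizes OMP scheduling overhead for cheap elementwise kernels. (Note that when `size < 256`, `num_threads` evaluates to `size / 256 == 0`; OpenMP expects a positive value in the `num_threads` clause, so a `std::max(1, ...)` clamp would be safer.) A minimal usage sketch — the buffers `a`, `b`, `c` and the `shape` instance are hypothetical, not part of this commit:

```cpp
// Hypothetical elementwise add over raw float buffers of shape.size elements.
shape.rangeElementwiseParallel([&](const int begin, const int end) {
    // Each callback invocation owns the half-open chunk [begin, end),
    // so the loop body needs no thread-local state.
    for (int i = begin; i < end; ++i)
        c[i] = a[i] + b[i];
});
```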

excuter/cpp-common/src/deepx/tensor.hpp (+1 −0)

```diff
@@ -26,6 +26,7 @@ namespace deepx
         DeleteFn deleter; // frees the memory
 
         using CopyFn = void (*)(T *, T *, int);
+        //copyer(src, dest, size)
         CopyFn copyer; // copies the memory
 
         using SaveFn = void (*)(T *,size_t,const std::string &);
```

excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp (+4 −0)

```diff
@@ -6,6 +6,10 @@
 
 namespace deepx::tensorfunc
 {
+    //todtype
+    template <typename T,typename Dtype>
+    void todtype(const Tensor<T> &input, Tensor<Dtype> &output);
+
     template <typename Author, typename T>
     struct addDispatcher
     {
```
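Only the declaration lands in this header. For illustration, a straightforward definition — a sketch assuming `Tensor<T>` exposes `shape` and a flat `data` pointer, not necessarily the commit's actual implementation — would cast element by element:

```cpp
// Illustrative sketch: elementwise precision cast from T to Dtype.
template <typename T, typename Dtype>
void todtype(const Tensor<T> &input, Tensor<Dtype> &output)
{
    // Walk the flat index space in chunks (see rangeElementwiseParallel
    // above); static_cast performs the per-element conversion.
    input.shape.rangeElementwiseParallel([&](const int begin, const int end) {
        for (int i = begin; i < end; ++i)
            output.data[i] = static_cast<Dtype>(input.data[i]);
    });
}
```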

excuter/cpp-common/src/deepx/tf/tffactory.cpp (+3 −4)

```diff
@@ -103,15 +103,14 @@ namespace deepx::tf
         // generate one table per tftype
         for (const auto &[tftype, tfs] : tf_by_type) {
             ss << "### " << tftype << "\n\n";
-            ss << "| Operation | Author | Func Def | Math Formula | IR Instruction |\n";
-            ss << "|-----------|--------|------------|--------------|----------------|\n";
+            ss << "| Operation | Author | Math Formula | IR Instruction |\n";
+            ss << "|-----------|--------|--------------|----------------|\n";
 
             for (const auto &tf : tfs) {
                 ss << "| " << tf->name << " | ";
                 ss << (tf->metadata.author.empty() ? " none " : tf->metadata.author) << " | ";
-                ss << tf->to_string(false, true) << " | ";
                 ss << tf->math_formula() << " | ";
-                ss << tf->to_string(false, true) << " |\n";
+                ss << stdutil::escape_markdown(tf->to_string(false, true)) << " |\n";
             }
 
             ss << "\n";
```
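Two fixes in one: the Func Def column, which printed the same `tf->to_string(false, true)` text as the IR Instruction column, is dropped, and the remaining IR string is routed through the new `stdutil::escape_markdown` so that embedded newlines, quotes, and backslashes cannot break the generated table row.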

excuter/cpp-common/src/stdutil/string.cpp (+41 −1)

```diff
@@ -8,9 +8,49 @@ namespace stdutil
         str.erase(str.find_last_not_of(" ") + 1);
     }
 
-    void trim(string &str,const string &chars)
+    void trim(string &str, const string &chars)
     {
         str.erase(0, str.find_first_not_of(chars));
         str.erase(str.find_last_not_of(chars) + 1);
     }
+
+    string escape_markdown(const string &str)
+    {
+        std::string result;
+        for (char c : str)
+        {
+            switch (c)
+            {
+            case '\\':
+                result += "\\\\";
+                break;
+            case '\"':
+                result += "\\\"";
+                break;
+            case '\'':
+                result += "\\\'";
+                break;
+            case '\n':
+                result += "\\n";
+                break;
+            case '\t':
+                result += "\\t";
+                break;
+            case '\r':
+                result += "\\r";
+                break;
+            case '\b':
+                result += "\\b";
+                break;
+            case '\f':
+                result += "\\f";
+                break;
+            default:
+                // ordinary characters are appended as-is
+                result += c;
+            }
+        }
+        return result;
+    }
+
 } // namespace stdutil
```
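A quick check of the new function (hypothetical input): each special character maps to its two-character escape, and everything else passes through unchanged.

```cpp
#include <iostream>
#include "stdutil/string.hpp"

int main()
{
    // The embedded newline and quotes come back as literal \n and \",
    // so the result stays on a single markdown table row.
    std::string cell = "line1\nline2 \"quoted\"";
    std::cout << stdutil::escape_markdown(cell) << std::endl;
    // prints: line1\nline2 \"quoted\"
}
```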

excuter/cpp-common/src/stdutil/string.hpp (+1 −0)

```diff
@@ -10,6 +10,7 @@ namespace stdutil
     void trimspace(string &str);
     void trim(string &str,const string &chars=" \t\n\r\f\v");
 
+    string escape_markdown(const string &str);
 }
```

excuter/op-mem-cuda/src/client/tfs.cpp (+12 −0)

```diff
@@ -164,6 +164,18 @@ namespace deepx::tf
     // elementwise
     void register_elementwise(TfFactory &tffactory)
     {
+        //todtype
+        tffactory.add_tf(std::make_shared<Todtype>(vector<Param>(
+                                                       {
+                                                           Param("a", DataCategory::Tensor, Precision::Any),
+                                                       }),
+                                                   vector<Param>(
+                                                       {
+                                                           Param("b", DataCategory::Tensor, Precision::Any),
+                                                       })));
+
+
+        // add
         tffactory.add_tf(std::make_shared<Add<miaobyte>>(vector<Param>(
             {
                 Param("a", DataCategory::Tensor, Precision::Any),
```
