Merge pull request #300 from OpenAccess-AI-Collective/pytorch-201
.github/workflows/base.yml
CHANGED
@@ -18,12 +18,12 @@ jobs:
           - cuda: "118"
             cuda_version: 11.8.0
             python_version: "3.9"
-            pytorch: 2.0.
+            pytorch: 2.0.1
             axolotl_extras:
           - cuda: "118"
             cuda_version: 11.8.0
             python_version: "3.10"
-            pytorch: 2.0.
+            pytorch: 2.0.1
             axolotl_extras:
           - cuda: "117"
             cuda_version: 11.7.1
@@ -33,7 +33,7 @@ jobs:
           - cuda: "118"
             cuda_version: 11.8.0
             python_version: "3.9"
-            pytorch: 2.0.
+            pytorch: 2.0.1
             axolotl_extras: gptq
     steps:
       - name: Checkout
.github/workflows/main.yml
CHANGED
@@ -17,17 +17,17 @@ jobs:
           - cuda: cu118
             cuda_version: 11.8.0
             python_version: "3.9"
-            pytorch: 2.0.
+            pytorch: 2.0.1
             axolotl_extras:
           - cuda: cu118
             cuda_version: 11.8.0
             python_version: "3.10"
-            pytorch: 2.0.
+            pytorch: 2.0.1
             axolotl_extras:
           - cuda: cu118
             cuda_version: 11.8.0
             python_version: "3.9"
-            pytorch: 2.0.
+            pytorch: 2.0.1
             axolotl_extras: gptq
           - cuda: cu117
             cuda_version: 11.7.1
@@ -72,17 +72,17 @@ jobs:
           - cuda: cu118
             cuda_version: 11.8.0
             python_version: "3.9"
-            pytorch: 2.0.
+            pytorch: 2.0.1
             axolotl_extras:
           - cuda: cu118
             cuda_version: 11.8.0
             python_version: "3.10"
-            pytorch: 2.0.
+            pytorch: 2.0.1
             axolotl_extras:
           - cuda: cu118
             cuda_version: 11.8.0
             python_version: "3.9"
-            pytorch: 2.0.
+            pytorch: 2.0.1
             axolotl_extras: gptq
           - cuda: cu117
             cuda_version: 11.7.1
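Both workflow matrices now pin PyTorch 2.0.1 across the CUDA 11.8 / 11.7 build variants. As a rough sanity check one might run inside a built image (a minimal sketch, not part of this PR; the expected values follow from the matrix entries above):

# Minimal sketch: confirm the torch/CUDA combination the matrix pins.
import torch

print(torch.__version__)          # expected to start with "2.0.1"
print(torch.version.cuda)         # "11.8" or "11.7", matching cuda_version
print(torch.cuda.is_available())  # True only when a GPU is visible in the container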
docker/Dockerfile-base
CHANGED
@@ -38,8 +38,9 @@ WORKDIR /workspace
 
 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
 
-RUN git clone https://github.com/
+RUN git clone https://github.com/Dao-AILab/flash-attention.git && \
     cd flash-attention && \
+    git checkout v1.0.9 && \
     python3 setup.py bdist_wheel && \
     cd csrc/fused_dense_lib && \
     python3 setup.py bdist_wheel && \
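The flash-attention build is now pinned to the v1.0.9 tag of Dao-AILab/flash-attention instead of whatever the default branch points at. A quick check that the wheel installed into the final image matches that tag (a sketch; the distribution name "flash-attn" is an assumption, not taken from this diff):

# Minimal sketch: verify the installed flash-attention wheel matches the pinned tag.
from importlib.metadata import version  # stdlib, Python 3.8+

print(version("flash-attn"))  # expected to report 1.0.9 for the pinned build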
src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
CHANGED
@@ -184,14 +184,15 @@ def sdp_attention_forward(
 
     # We only apply sdp attention if we don't need to output the whole attention matrix
     if not output_attentions:
-        attn_output = torch.nn.functional.scaled_dot_product_attention(
-            query_states,
-            key_states,
-            value_states,
-            attn_mask=attention_mask,
-            is_causal=False,
-        )
-        attn_weights = None
+        with torch.backends.cuda.sdp_kernel():
+            attn_output = torch.nn.functional.scaled_dot_product_attention(
+                query_states,
+                key_states,
+                value_states,
+                attn_mask=attention_mask,
+                is_causal=False,
+            )
+        attn_weights = None
     else:
         attn_weights = torch.matmul(
             query_states, key_states.transpose(2, 3)