commit | author | age
|
785be8
|
1 |
# --- |
573ee0
|
2 |
# Play: install and configure the NVIDIA GPU stack, driven from the bastion.
# Steps, in order:
#   1. pre-pull the container images listed in cache_images,
#   2. check out a pinned tag of the openshift-psap repository,
#   3. run three playbooks from that checkout: driver install, container
#      runtime hook, and device plugin.
- name: GPU installation and configuration
  hosts: bastions
  become: true
  gather_facts: false
  vars_files:
    - "{{ ANSIBLE_REPO_PATH }}/configs/{{ env_type }}/env_vars.yml"
    - "{{ ANSIBLE_REPO_PATH }}/configs/{{ env_type }}/env_secret_vars.yml"
  tags:
    - step005.1
    - gpu_install_config
  tasks:
    # with_items flattens one level, so this runs one `docker pull` per entry
    # of cache_images. cache_images is not defined in this play; presumably it
    # comes from one of the vars_files above -- confirm.
    - name: pre-pull relevant images
      command: docker pull {{ item }}
      with_items:
        - "{{ cache_images }}"

    # force discards local modifications in the checkout and update fetches
    # from the remote, so the working tree always matches the pinned version.
    - name: clone the psap repository onto the bastion
      git:
        repo: 'https://github.com/thoraxe/openshift-psap.git'
        dest: '/root/openshift-psap'
        version: 'ocp-311-0.7'
        force: true
        update: true

    # The nested ansible-playbook runs below use the bastion's own inventory
    # (/etc/ansible/hosts) and target the "masters" group. The folded scalars
    # (>-) join their lines with single spaces into one command string.
    - name: install the nvidia driver using the playbook
      command: >-
        /bin/ansible-playbook -i /etc/ansible/hosts
        -e hosts_to_apply="masters"
        -e cuda_driver_repo_base_url="http://admin.na.shared.opentlc.com/repos/nvidia/"
        -e cuda_driver_repo_suffix="cuda/"
        /root/openshift-psap/playbooks/nvidia-driver-install.yaml

    # NOTE(review): the package repos are served over plain http and gpgcheck
    # is disabled for both of them -- confirm this is acceptable here.
    - name: install the nvidia runtime hook using the playbook
      command: >-
        /bin/ansible-playbook -i /etc/ansible/hosts
        -e hosts_to_apply="masters"
        -e nvidia_container_hook_repo_base_url="http://admin.na.shared.opentlc.com/repos/nvidia/"
        -e libnvidia_container_suffix="libnvidia-container"
        -e libnvidia_container_repo_gpgcheck="false"
        -e nvidia_container_runtime_suffix="nvidia-container-runtime"
        -e nvidia_container_runtime_repo_gpgcheck="false"
        /root/openshift-psap/playbooks/nvidia-container-runtime-hook.yaml

    - name: install the nvidia device plugin using the playbook
      command: >-
        /bin/ansible-playbook -i /etc/ansible/hosts
        -e hosts_to_apply="masters"
        -e gpu_hosts="nodes"
        /root/openshift-psap/playbooks/nvidia-device-plugin.yaml
|
34 |
|
|
35 |
# Play: verify from the bastion that the GPU stack is usable.
# Checks, in order: the device-plugin daemonset reports at least one ready pod,
# every host in the "masters" group advertises nvidia.com/gpu capacity, and a
# CUDA vector-add test pod runs to completion and logs "Test PASSED".
- name: Pre-flight GPU checks
  hosts: bastions
  become: true
  gather_facts: false
  vars_files:
    - "{{ ANSIBLE_REPO_PATH }}/configs/{{ env_type }}/env_vars.yml"
    - "{{ ANSIBLE_REPO_PATH }}/configs/{{ env_type }}/env_secret_vars.yml"
  tags:
    - step005.2
    - gpu_preflight
  tasks:
    # If we do something multi-node, this will have to change to look for the
    # total number of GPU nodes rather than "at least one ready".
    # Retried up to 5 times, 60 seconds apart.
    - name: Check that the daemonset is ready
      command: >-
        oc get daemonset -n kube-system nvidia-deviceplugin-daemonset
        -o jsonpath --template='{.status.numberReady}'
      register: daemonset_ready_out
      until: 'daemonset_ready_out.stdout | int >= 1'
      retries: 5
      delay: 60

    # If we do something multi-node, this will have to change: only hosts in
    # the "masters" group are inspected.
    # The capacity is compared numerically so that a node with several GPUs
    # passes; empty or non-numeric output converts to 0 and fails the task.
    - name: Check that the node reports capacity
      command: >-
        oc get node {{ item }}
        -o jsonpath --template='{.status.capacity.nvidia\.com/gpu}'
      register: node_gpu_capacity_out
      failed_when: 'node_gpu_capacity_out.stdout | int < 1'
      with_items: '{{ groups["masters"] }}'

    # Best-effort cleanup: a failure here is deliberately ignored so that the
    # play continues to the create step below.
    - name: Delete cuda vector pod in case it was already created
      command: oc delete pod cuda-vector-add -n nvidia-device-plugin
      ignore_errors: true

    # Lines matching "namespace: nvidia" are stripped from the manifest so the
    # -n flag alone decides the target namespace. Needs the shell module
    # because of the pipeline.
    - name: Deploy the cuda vector pod
      shell: >-
        cat /root/openshift-psap/blog/gpu/device-plugin/cuda-vector-add.yaml
        | grep -v "namespace: nvidia"
        | oc create -n nvidia-device-plugin -f -

    # Polls the pod phase until it contains "Succeeded"; retried up to 5 times,
    # 60 seconds apart.
    - name: Wait for pod completion
      command: >-
        oc get pod -n nvidia-device-plugin cuda-vector-add
        -o jsonpath --template='{.status.phase}'
      register: cuda_status_out
      until: '"Succeeded" in cuda_status_out.stdout'
      retries: 5
      delay: 60

    # A pod in phase Succeeded is not enough on its own: the pod log must also
    # contain the "Test PASSED" marker.
    - name: Double check cuda pod success
      command: oc logs -n nvidia-device-plugin cuda-vector-add
      register: cuda_pod_out
      failed_when: '"Test PASSED" not in cuda_pod_out.stdout'
|
79 |
|
|
80 |
# Play: user administration, executed on the bastion with privilege escalation.
# Its single task binds the cluster-admin cluster role to the user "gpu-user".
- name: User administration
  hosts: bastions
  become: true
  gather_facts: false
  vars_files:
    - "{{ ANSIBLE_REPO_PATH }}/configs/{{ env_type }}/env_vars.yml"
    - "{{ ANSIBLE_REPO_PATH }}/configs/{{ env_type }}/env_secret_vars.yml"
  tags:
    - step005.3
    - user_admin
  tasks:
    # NOTE(review): cluster-admin is a cluster-wide role; confirm that
    # gpu-user really needs this level of access.
    - name: Give the gpu-user cluster admin privileges
      command: >-
        oc adm policy add-cluster-role-to-user cluster-admin gpu-user
|
93 |
|
b7fc0f
|
94 |
# Play: final marker, run locally on the control host without privilege
# escalation. It only prints a completion message.
- name: PostSoftware flight-check
  hosts: localhost
  connection: local
  gather_facts: false
  become: false
  vars_files:
    - "{{ ANSIBLE_REPO_PATH }}/configs/{{ env_type }}/env_vars.yml"
  tags:
    - post_flight_check
  tasks:
    # The task is named so that it is identifiable in the run output.
    - name: Report that the post-software checks completed
      debug:
        msg: "Post-Software checks completed successfully"