Erik M Jacobs
2018-11-08 aa20d8eb63bcf55642ff6922725bc89ca53609af
commit | author | age
785be8 1 # ---
# Play: install and configure the NVIDIA GPU stack, driven from the bastion
# host(s) with privilege escalation. It pre-pulls the images listed in
# cache_images, checks out the openshift-psap repository, then runs three of
# that repository's playbooks through nested ansible-playbook invocations.
- name: GPU installation and configuration
  hosts: bastions
  gather_facts: false
  become: true
  tags: [step005.1, gpu_install_config]
  vars_files:
    - '{{ ANSIBLE_REPO_PATH }}/configs/{{ env_type }}/env_vars.yml'
    - '{{ ANSIBLE_REPO_PATH }}/configs/{{ env_type }}/env_secret_vars.yml'
  tasks:
    # cache_images is not defined in this file; presumably it comes from one
    # of the vars_files above -- confirm.
    - name: pre-pull relevant images
      with_items: ["{{ cache_images }}"]
      command: docker pull {{ item }}

    # Pinned to a fixed ref of the repository. force and update are both
    # enabled on the git module for this checkout.
    - name: clone the psap repository onto the bastion
      git:
        dest: "/root/openshift-psap"
        repo: "https://github.com/thoraxe/openshift-psap.git"
        version: "ocp-311-0.7"
        update: true
        force: true

    # The three tasks below shell out to a nested ansible-playbook run that
    # uses the inventory at /etc/ansible/hosts and targets the "masters"
    # group via hosts_to_apply. The folded scalars join into exactly one
    # command line each.
    # NOTE(review): the package repo is addressed over plain HTTP and the
    # runtime-hook install passes gpgcheck="false" -- confirm this is
    # acceptable for the target environment.
    - name: install the nvidia driver using the playbook
      command: >-
        /bin/ansible-playbook -i /etc/ansible/hosts
        -e hosts_to_apply="masters"
        -e cuda_driver_repo_base_url="http://admin.na.shared.opentlc.com/repos/nvidia/"
        -e cuda_driver_repo_suffix="cuda/"
        /root/openshift-psap/playbooks/nvidia-driver-install.yaml

    - name: install the nvidia runtime hook using the playbook
      command: >-
        /bin/ansible-playbook -i /etc/ansible/hosts
        -e hosts_to_apply="masters"
        -e nvidia_container_hook_repo_base_url="http://admin.na.shared.opentlc.com/repos/nvidia/"
        -e libnvidia_container_suffix="libnvidia-container"
        -e libnvidia_container_repo_gpgcheck="false"
        -e nvidia_container_runtime_suffix="nvidia-container-runtime"
        -e nvidia_container_runtime_repo_gpgcheck="false"
        /root/openshift-psap/playbooks/nvidia-container-runtime-hook.yaml

    - name: install the nvidia device plugin using the playbook
      command: >-
        /bin/ansible-playbook -i /etc/ansible/hosts
        -e hosts_to_apply="masters"
        -e gpu_hosts="nodes"
        /root/openshift-psap/playbooks/nvidia-device-plugin.yaml
34
# Play: verify from the bastion host(s) that the GPU stack is functional.
# It polls the device-plugin daemonset, checks the GPU capacity reported by
# each host in the "masters" group, then runs the cuda-vector-add sample pod
# and inspects its phase and logs.
- name: Pre-flight GPU checks
  hosts: bastions
  become: true
  gather_facts: false
  vars_files:
    - "{{ ANSIBLE_REPO_PATH }}/configs/{{ env_type }}/env_vars.yml"
    - "{{ ANSIBLE_REPO_PATH }}/configs/{{ env_type }}/env_secret_vars.yml"
  tags:
    - step005.2
    - gpu_preflight
  tasks:
    # if we do something multi-node this will have to change to look for the
    # total number of gpu nodes
    # Polls up to 5 times, 60 seconds apart, until at least one daemonset pod
    # is ready. Read-only query, so it never reports "changed".
    - name: Check that the daemonset is ready
      command: >-
        oc get daemonset -n kube-system nvidia-deviceplugin-daemonset
        -o jsonpath --template='{.status.numberReady}'
      register: daemonset_ready_out
      until: 'daemonset_ready_out.stdout | int >= 1'
      retries: 5
      delay: 60
      changed_when: false

    # if we do something multi-node this will have to change
    # Fails when the query itself fails or when the node reports fewer than
    # one nvidia.com/gpu. An empty stdout converts to 0 and therefore fails.
    # The comparison is numeric (matching the daemonset check above) rather
    # than a substring match, so a node with more than one GPU also passes.
    - name: Check that the node reports capacity
      command: oc get node {{ item }} -o jsonpath --template='{.status.capacity.nvidia\.com/gpu}'
      register: node_gpu_capacity_out
      failed_when: >-
        node_gpu_capacity_out.rc != 0
        or (node_gpu_capacity_out.stdout | int) < 1
      changed_when: false
      with_items: '{{ groups["masters"] }}'

    # --ignore-not-found makes the expected "pod does not exist yet" case a
    # clean success, so other failures are no longer silently ignored.
    - name: Delete cuda vector pod in case it was already created
      command: oc delete pod cuda-vector-add -n nvidia-device-plugin --ignore-not-found
      register: cuda_pod_delete_out
      changed_when: '"deleted" in cuda_pod_delete_out.stdout'

    # The manifest's own "namespace: nvidia..." line is filtered out so the
    # pod is created in the namespace given with -n instead.
    - name: Deploy the cuda vector pod
      shell: >-
        grep -v "namespace: nvidia"
        /root/openshift-psap/blog/gpu/device-plugin/cuda-vector-add.yaml
        | oc create -n nvidia-device-plugin -f -

    # Polls up to 5 times, 60 seconds apart, until the pod phase is Succeeded.
    - name: Wait for pod completion
      command: >-
        oc get pod -n nvidia-device-plugin cuda-vector-add
        -o jsonpath --template='{.status.phase}'
      register: cuda_status_out
      until: '"Succeeded" in cuda_status_out.stdout'
      retries: 5
      delay: 60
      changed_when: false

    # The pod log must contain the sample's "Test PASSED" marker.
    - name: Double check cuda pod success
      command: oc logs -n nvidia-device-plugin cuda-vector-add
      register: cuda_pod_out
      failed_when: '"Test PASSED" not in cuda_pod_out.stdout'
      changed_when: false
79
# Play: user administration, run from the bastion host(s) with privilege
# escalation. Its single task binds the cluster-admin cluster role to the
# gpu-user account via the oc CLI.
- name: User administration
  hosts: bastions
  gather_facts: false
  become: true
  tags: [step005.3, user_admin]
  vars_files:
    - '{{ ANSIBLE_REPO_PATH }}/configs/{{ env_type }}/env_vars.yml'
    - '{{ ANSIBLE_REPO_PATH }}/configs/{{ env_type }}/env_secret_vars.yml'
  tasks:
    # NOTE(review): cluster-admin is the broadest role available; confirm
    # gpu-user really needs it.
    - name: Give the gpu-user cluster admin privileges
      command: >-
        oc adm policy add-cluster-role-to-user cluster-admin gpu-user
93
# Play: final marker, executed locally on the control node without privilege
# escalation. It only prints a completion message.
- name: PostSoftware flight-check
  hosts: localhost
  connection: local
  become: false
  gather_facts: false
  tags:
    - post_flight_check
  vars_files:
    - '{{ ANSIBLE_REPO_PATH }}/configs/{{ env_type }}/env_vars.yml'
  tasks:
    # Emits the success message; performs no checks of its own.
    - debug:
        msg: 'Post-Software checks completed successfully'