Difference between revisions of "Bright:Add GPUs to OpenStack"
Jump to navigation
Jump to search
| Line 154: | Line 154: | ||
[root@gpu ~]# | [root@gpu ~]# | ||
| + | </syntaxhighlight> | ||
| + | |||
| + | == Setup Flavor with GPU == | ||
| + | <syntaxhighlight> | ||
| + | openstack flavor create --public --ram 2048 --disk 20 --vcpus 2 m1.small.1xgpu | ||
| + | # jesus this command killed me! (very little online, should be doable with openstack cmd but whatever version bright were using wasnt supported, nova flavor-list also failed) | ||
| + | [root@shadow-head ~]# nova-manage flavor set_key --name m1.small.1xgpu --key "pci_passthrough:alias" --value "K80_Telsa:1" | ||
| + | Key pci_passthrough:alias set to K80_Telsa:1 on instance type m1.small.1xgpu | ||
| + | # check the flavors | ||
| + | [root@shadow-head ~]# nova-manage flavor list | ||
| + | m1.medium: Memory: 4096MB, VCPUS: 2, Root: 40GB, Ephemeral: 0Gb, FlavorID: 3, Swap: 0MB, RXTX Factor: 1.0, public, ExtraSpecs {} | ||
| + | m1.tiny: Memory: 512MB, VCPUS: 1, Root: 1GB, Ephemeral: 0Gb, FlavorID: 1, Swap: 0MB, RXTX Factor: 1.0, public, ExtraSpecs {} | ||
| + | m1.large: Memory: 8192MB, VCPUS: 4, Root: 80GB, Ephemeral: 0Gb, FlavorID: 4, Swap: 0MB, RXTX Factor: 1.0, public, ExtraSpecs {} | ||
| + | m1.xlarge: Memory: 16384MB, VCPUS: 8, Root: 160GB, Ephemeral: 0Gb, FlavorID: 5, Swap: 0MB, RXTX Factor: 1.0, public, ExtraSpecs {} | ||
| + | m1.small: Memory: 2048MB, VCPUS: 1, Root: 20GB, Ephemeral: 0Gb, FlavorID: 2, Swap: 0MB, RXTX Factor: 1.0, public, ExtraSpecs {} | ||
| + | b1.small: Memory: 1024MB, VCPUS: 1, Root: 25GB, Ephemeral: 0Gb, FlavorID: 5496005e-f3a1-48fb-b893-40a13b2ff008, Swap: 0MB, RXTX Factor: 1.0, public, ExtraSpecs {} | ||
| + | m1.xsmall: Memory: 1024MB, VCPUS: 1, Root: 10GB, Ephemeral: 10Gb, FlavorID: 5e5549cf-94d8-4069-843c-ae37b7706d06, Swap: 0MB, RXTX Factor: 1.0, public, ExtraSpecs {} | ||
| + | hackathon: Memory: 2048MB, VCPUS: 2, Root: 100GB, Ephemeral: 0Gb, FlavorID: 41371240-c1a3-4df4-bf4a-dad152423f7e, Swap: 0MB, RXTX Factor: 1.0, public, ExtraSpecs {} | ||
| + | m1.large-plus: Memory: 16384MB, VCPUS: 4, Root: 80GB, Ephemeral: 0Gb, FlavorID: 103f37bf-949b-4a45-953e-d7ead651aeba, Swap: 0MB, RXTX Factor: 1.0, public, ExtraSpecs {} | ||
| + | m1.small.1xgpu: Memory: 2048MB, VCPUS: 2, Root: 20GB, Ephemeral: 0Gb, FlavorID: 409aafa7-ee82-4c64-83b0-d5fa2ca49288, Swap: 0MB, RXTX Factor: 1.0, public, ExtraSpecs {u'pci_passthrough:alias': u'K80_Telsa:1'} | ||
</syntaxhighlight> | </syntaxhighlight> | ||
Revision as of 17:50, 24 March 2016
Performed on the CentOS 7 node with the 2x K80 GPUs (provisioned as a nova hypervisor under bright / openstack-image - default)
Ensure Intel IOMMU is enabled
Need to pass intel_iommu=on as a kernel arg at boot
[shadow-head]% softwareimage
[shadow-head->softwareimage]% use openstack-image
[shadow-head->softwareimage[openstack-image]]% list
Name (key) Path Kernel version
-------------------- ---------------------------------------- ----------------------
default-image /cm/images/default-image 3.10.0-229.el7.x86_64
openstack-image /cm/images/openstack-image 3.10.0-229.el7.x86_64
test-image /cm/images/test-image 3.10.0-229.el7.x86_64
[shadow-head->softwareimage[openstack-image]]% show
Parameter Value
-------------------------------- ------------------------------------------------
Boot FSPart 98784247814
Creation time Tue, 29 Sep 2015 10:29:56 GMT
Enable SOL yes
FSPart 98784247814
Kernel modules <38 in submode>
Kernel parameters rdblacklist=nouveau
Kernel version 3.10.0-229.el7.x86_64
Locked no
Name openstack-image
Notes <0 bytes>
Path /cm/images/openstack-image
Revision
SOL Flow Control no
SOL Port ttyS1
SOL Speed 115200
[shadow-head->softwareimage[openstack-image]]% set kernelparameters "rdblacklist=nouveau intel_iommu=on"
[shadow-head->softwareimage*[openstack-image*]]% show
Parameter Value
-------------------------------- ------------------------------------------------
Boot FSPart 98784247814
Creation time Tue, 29 Sep 2015 10:29:56 GMT
Enable SOL yes
FSPart 98784247814
Kernel modules <38 in submode>
Kernel parameters rdblacklist=nouveau intel_iommu=on
Kernel version 3.10.0-229.el7.x86_64
Locked no
Name openstack-image
Notes <0 bytes>
Path /cm/images/openstack-image
Revision
SOL Flow Control no
SOL Port ttyS1
SOL Speed 115200
[shadow-head->softwareimage*[openstack-image*]]% commit
=============================== openstack-image ================================
Field Message
------------------------ --------------------------------------------------------------
module Warning: Module xhci-hcd does not exist for specified kernel.
[shadow-head->softwareimage[openstack-image]]%
Now when the node is booted up, check the dmesg output to confirm IOMMU is enabled:
[root@gpu ~]# dmesg | grep -iE "dmar|iommu" | grep -i enabled
[ 0.000000] Intel-IOMMU: enabled
Get the PCI IDs of the GPUs
In this instance we are using K80s
# check for the nvidia GPUs
[root@gpu ~]# lspci | grep -i nvidia
04:00.0 3D controller: NVIDIA Corporation GK210GL [Tesla K80] (rev a1)
05:00.0 3D controller: NVIDIA Corporation GK210GL [Tesla K80] (rev a1)
# get their vendor:product IDs with the -nn flag
[root@gpu ~]# lspci -nn | grep -i nvidia
04:00.0 3D controller [0302]: NVIDIA Corporation GK210GL [Tesla K80] [10de:102d] (rev a1)
05:00.0 3D controller [0302]: NVIDIA Corporation GK210GL [Tesla K80] [10de:102d] (rev a1)
- So the vendor id is: 10de
- And the product id is: 102d
Update the /etc/nova/nova.conf file
Applied to the headnode, this will sync with other nodes (all nodes will run same nova.conf version)
Some settings needed to be updated through cmsh, as the files kept getting overwritten.
[shadow-head->openstack[default]->settings:compute]% set schedulerfilters "RetryFilter AvailabilityZoneFilter RamFilter ComputeFilter ComputeCapabilitiesFilter ImagePropertiesFilter ServerGroupAntiAffinityFilter ServerGroupAffinityFilter PciPassthroughFilter"
[shadow-head->openstack*[default*]->settings:compute*]% show
Parameter Value
----------------------------------- ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
CPU Allocation ratio 16
Disk Allocation ratio 1
Live migration enabled yes
Live migration retries 2
Max attempts to schedule instance 5
Max instances per host 50
RAM Allocation ratio 1.5
Reserved host memory 512MB
Revision
Scheduler filters RetryFilter AvailabilityZoneFilter RamFilter ComputeFilter ComputeCapabilitiesFilter ImagePropertiesFilter ServerGroupAntiAffinityFilter ServerGroupAffinityFilter PciPassthroughFilter
Default availability zone default
VNC proxy hostname 172.28.0.199
[shadow-head->openstack*[default*]->settings:compute*]% commit
=================================== default ====================================
Field Message
------------------------ -------------------------------------------------------------
settingscredentials/main Warning: Value is too short (expected at least 8 characters)
adminpassword
[shadow-head->openstack[default]->settings:compute]%
Thu Mar 24 15:40:52 2016 [notice] shadow-head: Service openstack-nova-api was restarted
[shadow-head->openstack[default]->settings:compute]%
Thu Mar 24 15:40:54 2016 [notice] shadow-head: Service openstack-nova-conductor was restarted
[shadow-head->openstack[default]->settings:compute]%
Thu Mar 24 15:40:56 2016 [notice] shadow-head: Service openstack-nova-consoleauth was restarted
[shadow-head->openstack[default]->settings:compute]%
Thu Mar 24 15:40:58 2016 [notice] shadow-head: Service openstack-nova-metadata-api was restarted
[shadow-head->openstack[default]->settings:compute]%
Thu Mar 24 15:40:59 2016 [notice] shadow-head: Service openstack-nova-novncproxy was restarted
[shadow-head->openstack[default]->settings:compute]%
Thu Mar 24 15:41:01 2016 [notice] shadow-head: Service openstack-nova-scheduler was restarted
Eventually, we should end up with:
# PCI alias and whitelist
[root@shadow-head nova]# grep ^pci_ /etc/nova/nova.conf
pci_alias = { 'name': 'K80_Tesla', 'vendor_id': '10de', 'product_id': '102d' }
pci_passthrough_whitelist = [{ "vendor_id": "10de", "product_id": "102d" }]
# Scheduler params
[root@shadow-head nova]# grep ^scheduler /etc/nova/nova.conf
scheduler_max_attempts=5
scheduler_default_filters=RetryFilter,AvailabilityZoneFilter,RamFilter,ComputeFilter,ComputeCapabilitiesFilter,ImagePropertiesFilter,ServerGroupAntiAffinityFilter,ServerGroupAffinityFilter,PciPassthroughFilter
scheduler_available_filters=nova.scheduler.filters.all_filters
scheduler_available_filters=nova.scheduler.filters.pci_passthrough_filter.PciPassthroughFilter
scheduler_driver=nova.scheduler.filter_scheduler.FilterScheduler
Restart the services
[root@shadow-head ~]# systemctl restart openstack-nova-api
[root@shadow-head ~]# systemctl restart openstack-nova-conductor
Job for openstack-nova-conductor.service canceled.
[root@shadow-head ~]# systemctl restart openstack-nova-consoleauth
[root@shadow-head ~]# systemctl restart openstack-nova-metadata-api
[root@shadow-head ~]# systemctl restart openstack-nova-novncproxy
[root@shadow-head ~]# systemctl restart openstack-nova-scheduler
# copy to node in question (presume bright will sync it eventually anyway)
[root@shadow-head ~]# scp /etc/nova/nova.conf gpu:/etc/nova/nova.conf
nova.conf 100% 109KB 108.9KB/s 00:00
[root@shadow-head ~]# ssh gpu
[root@gpu ~]# systemctl restart openstack-nova-compute
[root@gpu ~]#
Setup Flavor with GPU
openstack flavor create --public --ram 2048 --disk 20 --vcpus 2 m1.small.1xgpu
# This command took a long time to find (there is very little about it online). It should be doable with the openstack command, but the version Bright was using did not support it; nova flavor-list also failed.
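# NOTE: the alias in --value must exactly match the 'name' given in pci_alias in /etc/nova/nova.conf. The nova.conf shown earlier uses 'K80_Tesla', whereas 'K80_Telsa' is what was typed here - make sure the two agree.
[root@shadow-head ~]# nova-manage flavor set_key --name m1.small.1xgpu --key "pci_passthrough:alias" --value "K80_Telsa:1"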
Key pci_passthrough:alias set to K80_Telsa:1 on instance type m1.small.1xgpu
# check the flavors
[root@shadow-head ~]# nova-manage flavor list
m1.medium: Memory: 4096MB, VCPUS: 2, Root: 40GB, Ephemeral: 0Gb, FlavorID: 3, Swap: 0MB, RXTX Factor: 1.0, public, ExtraSpecs {}
m1.tiny: Memory: 512MB, VCPUS: 1, Root: 1GB, Ephemeral: 0Gb, FlavorID: 1, Swap: 0MB, RXTX Factor: 1.0, public, ExtraSpecs {}
m1.large: Memory: 8192MB, VCPUS: 4, Root: 80GB, Ephemeral: 0Gb, FlavorID: 4, Swap: 0MB, RXTX Factor: 1.0, public, ExtraSpecs {}
m1.xlarge: Memory: 16384MB, VCPUS: 8, Root: 160GB, Ephemeral: 0Gb, FlavorID: 5, Swap: 0MB, RXTX Factor: 1.0, public, ExtraSpecs {}
m1.small: Memory: 2048MB, VCPUS: 1, Root: 20GB, Ephemeral: 0Gb, FlavorID: 2, Swap: 0MB, RXTX Factor: 1.0, public, ExtraSpecs {}
b1.small: Memory: 1024MB, VCPUS: 1, Root: 25GB, Ephemeral: 0Gb, FlavorID: 5496005e-f3a1-48fb-b893-40a13b2ff008, Swap: 0MB, RXTX Factor: 1.0, public, ExtraSpecs {}
m1.xsmall: Memory: 1024MB, VCPUS: 1, Root: 10GB, Ephemeral: 10Gb, FlavorID: 5e5549cf-94d8-4069-843c-ae37b7706d06, Swap: 0MB, RXTX Factor: 1.0, public, ExtraSpecs {}
hackathon: Memory: 2048MB, VCPUS: 2, Root: 100GB, Ephemeral: 0Gb, FlavorID: 41371240-c1a3-4df4-bf4a-dad152423f7e, Swap: 0MB, RXTX Factor: 1.0, public, ExtraSpecs {}
m1.large-plus: Memory: 16384MB, VCPUS: 4, Root: 80GB, Ephemeral: 0Gb, FlavorID: 103f37bf-949b-4a45-953e-d7ead651aeba, Swap: 0MB, RXTX Factor: 1.0, public, ExtraSpecs {}
m1.small.1xgpu: Memory: 2048MB, VCPUS: 2, Root: 20GB, Ephemeral: 0Gb, FlavorID: 409aafa7-ee82-4c64-83b0-d5fa2ca49288, Swap: 0MB, RXTX Factor: 1.0, public, ExtraSpecs {u'pci_passthrough:alias': u'K80_Telsa:1'}