Problem
From within a Kubeflow Jupyter notebook, the connection to Kubeflow Pipelines fails, even though I followed the documentation section "Connect to Kubeflow Pipelines from the same cluster - Multi-user mode".
import os
import kfp
# Read the service-account token from the file named by KF_PIPELINES_SA_TOKEN_PATH.
token_path = os.environ['KF_PIPELINES_SA_TOKEN_PATH']
with open(token_path, "r") as token_file:
    TOKEN = token_file.read()

# Only the token is supplied here; no host is passed to the client.
client = kfp.Client(existing_token=TOKEN)

# Ask the Pipelines API for the pipeline list and print the response.
print(client.list_pipelines())
---------------------------------------------------------------------------
ConnectionRefusedError Traceback (most recent call last)
/opt/conda/lib/python3.8/site-packages/urllib3/connection.py in _new_conn(self)
168 try:
--> 169 conn = connection.create_connection(
170 (self._dns_host, self.port), self.timeout, **extra_kw
/opt/conda/lib/python3.8/site-packages/urllib3/util/connection.py in create_connection(address, timeout, source_address, socket_options)
95 if err is not None:
---> 96 raise err
97
/opt/conda/lib/python3.8/site-packages/urllib3/util/connection.py in create_connection(address, timeout, source_address, socket_options)
85 sock.bind(source_address)
---> 86 sock.connect(sa)
87 return sock
ConnectionRefusedError: [Errno 111] Connection refused
During handling of the above exception, another exception occurred:
NewConnectionError Traceback (most recent call last)
/opt/conda/lib/python3.8/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
698 # Make the request on the httplib connection object.
--> 699 httplib_response = self._make_request(
700 conn,
/opt/conda/lib/python3.8/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
393 else:
--> 394 conn.request(method, url, **httplib_request_kw)
395
/opt/conda/lib/python3.8/site-packages/urllib3/connection.py in request(self, method, url, body, headers)
233 headers["User-Agent"] = _get_default_user_agent()
--> 234 super(HTTPConnection, self).request(method, url, body=body, headers=headers)
235
/opt/conda/lib/python3.8/http/client.py in request(self, method, url, body, headers, encode_chunked)
1251 """Send a complete request to the server."""
-> 1252 self._send_request(method, url, body, headers, encode_chunked)
1253
/opt/conda/lib/python3.8/http/client.py in _send_request(self, method, url, body, headers, encode_chunked)
1297 body = _encode(body, 'body')
-> 1298 self.endheaders(body, encode_chunked=encode_chunked)
1299
/opt/conda/lib/python3.8/http/client.py in endheaders(self, message_body, encode_chunked)
1246 raise CannotSendHeader()
-> 1247 self._send_output(message_body, encode_chunked=encode_chunked)
1248
/opt/conda/lib/python3.8/http/client.py in _send_output(self, message_body, encode_chunked)
1006 del self._buffer[:]
-> 1007 self.send(msg)
1008
/opt/conda/lib/python3.8/http/client.py in send(self, data)
946 if self.auto_open:
--> 947 self.connect()
948 else:
/opt/conda/lib/python3.8/site-packages/urllib3/connection.py in connect(self)
199 def connect(self):
--> 200 conn = self._new_conn()
201 self._prepare_conn(conn)
/opt/conda/lib/python3.8/site-packages/urllib3/connection.py in _new_conn(self)
180 except SocketError as e:
--> 181 raise NewConnectionError(
182 self, "Failed to establish a new connection: %s" % e
NewConnectionError: <urllib3.connection.HTTPConnection object at 0x7f5b1ac2e2b0>: Failed to establish a new connection: [Errno 111] Connection refused
During handling of the above exception, another exception occurred:
MaxRetryError Traceback (most recent call last)
<ipython-input-26-245cf5dc3b72> in <module>
3 existing_token=TOKEN
4 )
----> 5 print(client.list_pipelines())
/opt/conda/lib/python3.8/site-packages/kfp/_client.py in list_pipelines(self, page_token, page_size, sort_by)
543 A response object including a list of pipelines and next page token.
544 """
--> 545 return self._pipelines_api.list_pipelines(page_token=page_token, page_size=page_size, sort_by=sort_by)
546
547 def list_pipeline_versions(self, pipeline_id: str, page_token='', page_size=10, sort_by=''):
/opt/conda/lib/python3.8/site-packages/kfp_server_api/api/pipeline_service_api.py in list_pipelines(self, **kwargs)
1210 """
1211 kwargs['_return_http_data_only'] = True
-> 1212 return self.list_pipelines_with_http_info(**kwargs) # noqa: E501
1213
1214 def list_pipelines_with_http_info(self, **kwargs): # noqa: E501
/opt/conda/lib/python3.8/site-packages/kfp_server_api/api/pipeline_service_api.py in list_pipelines_with_http_info(self, **kwargs)
1311 auth_settings = ['Bearer'] # noqa: E501
1312
-> 1313 return self.api_client.call_api(
1314 '/apis/v1beta1/pipelines', 'GET',
1315 path_params,
/opt/conda/lib/python3.8/site-packages/kfp_server_api/api_client.py in call_api(self, resource_path, method, path_params, query_params, header_params, body, post_params, files, response_type, auth_settings, async_req, _return_http_data_only, collection_formats, _preload_content, _request_timeout, _host)
362 """
363 if not async_req:
--> 364 return self.__call_api(resource_path, method,
365 path_params, query_params, header_params,
366 body, post_params, files,
/opt/conda/lib/python3.8/site-packages/kfp_server_api/api_client.py in __call_api(self, resource_path, method, path_params, query_params, header_params, body, post_params, files, response_type, auth_settings, _return_http_data_only, collection_formats, _preload_content, _request_timeout, _host)
179 try:
180 # perform request and return response
--> 181 response_data = self.request(
182 method, url, query_params=query_params, headers=header_params,
183 post_params=post_params, body=body,
/opt/conda/lib/python3.8/site-packages/kfp_server_api/api_client.py in request(self, method, url, query_params, headers, post_params, body, _preload_content, _request_timeout)
387 """Makes the HTTP request using RESTClient."""
388 if method == "GET":
--> 389 return self.rest_client.GET(url,
390 query_params=query_params,
391 _preload_content=_preload_content,
/opt/conda/lib/python3.8/site-packages/kfp_server_api/rest.py in GET(self, url, headers, query_params, _preload_content, _request_timeout)
228 def GET(self, url, headers=None, query_params=None, _preload_content=True,
229 _request_timeout=None):
--> 230 return self.request("GET", url,
231 headers=headers,
232 _preload_content=_preload_content,
/opt/conda/lib/python3.8/site-packages/kfp_server_api/rest.py in request(self, method, url, query_params, headers, body, post_params, _preload_content, _request_timeout)
206 # For `GET`, `HEAD`
207 else:
--> 208 r = self.pool_manager.request(method, url,
209 fields=query_params,
210 preload_content=_preload_content,
/opt/conda/lib/python3.8/site-packages/urllib3/request.py in request(self, method, url, fields, headers, **urlopen_kw)
72
73 if method in self._encode_url_methods:
---> 74 return self.request_encode_url(
75 method, url, fields=fields, headers=headers, **urlopen_kw
76 )
/opt/conda/lib/python3.8/site-packages/urllib3/request.py in request_encode_url(self, method, url, fields, headers, **urlopen_kw)
94 url += "?" + urlencode(fields)
95
---> 96 return self.urlopen(method, url, **extra_kw)
97
98 def request_encode_body(
/opt/conda/lib/python3.8/site-packages/urllib3/poolmanager.py in urlopen(self, method, url, redirect, **kw)
373 response = conn.urlopen(method, url, **kw)
374 else:
--> 375 response = conn.urlopen(method, u.request_uri, **kw)
376
377 redirect_location = redirect and response.get_redirect_location()
/opt/conda/lib/python3.8/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
781 "Retrying (%r) after connection broken by '%r': %s", retries, err, url
782 )
--> 783 return self.urlopen(
784 method,
785 url,
/opt/conda/lib/python3.8/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
781 "Retrying (%r) after connection broken by '%r': %s", retries, err, url
782 )
--> 783 return self.urlopen(
784 method,
785 url,
/opt/conda/lib/python3.8/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
781 "Retrying (%r) after connection broken by '%r': %s", retries, err, url
782 )
--> 783 return self.urlopen(
784 method,
785 url,
/opt/conda/lib/python3.8/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
753 e = ProtocolError("Connection aborted.", e)
754
--> 755 retries = retries.increment(
756 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
757 )
/opt/conda/lib/python3.8/site-packages/urllib3/util/retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
572
573 if new_retry.is_exhausted():
--> 574 raise MaxRetryError(_pool, url, error or ResponseError(cause))
575
576 log.debug("Incremented Retry for (url='%s'): %r", url, new_retry)
MaxRetryError: HTTPConnectionPool(host='localhost', port=80): Max retries exceeded with url: /apis/v1beta1/pipelines?page_token=&page_size=10&sort_by= (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f5b1ac2e2b0>: Failed to establish a new connection: [Errno 111] Connection refused'))
Related
The issue is discussed in the GitHub issue below, but there is no clear answer.
[Multi User] failed to call kfp.Client().create_run_from_pipeline_func in in-cluster juypter notebook #4440
import os

# The token is still read here, although the client below is created without it.
with open(os.environ['KF_PIPELINES_SA_TOKEN_PATH'], "r") as token_file:
    TOKEN = token_file.read()

import kfp

# Point the client at the ml-pipeline service through its in-cluster DNS name.
# NOTE: using 'http://ml-pipeline-ui.kubeflow.svc.cluster.local:80' as the host
# does not work; it later fails with "HTTP response body: RBAC: access denied".
client = kfp.Client(
    host='http://ml-pipeline.kubeflow.svc.cluster.local:8888',
    # existing_token=TOKEN,  # not required
)

# Ask the Pipelines API for the pipeline list and print the response.
print(client.list_pipelines())
Result
{'next_page_token': None,
'pipelines': [{'created_at': datetime.datetime(2022, 5, 22, 2, 5, 33, tzinfo=tzlocal()),
'default_version': {'code_source_url': None,
'created_at': datetime.datetime(2022, 5, 22, 2, 5, 33, tzinfo=tzlocal()),
'id': 'b693a0d3-b11c-4c5b-b3f9-6158382948d6',
'name': '[Demo] XGBoost - Iterative model '
'training',
'package_url': None,
'parameters': None,
'resource_references': [{'key': {'id': 'b693a0d3-b11c-4c5b-b3f9-6158382948d6',
'type': 'PIPELINE'},
'name': None,
'relationship': 'OWNER'}]},
'description': '[source '
'code](https://github.com/kubeflow/pipelines/blob/c8a18bde299f2fdf5f72144f15887915b8d11520/samples/core/train_until_good/train_until_good.py) '
'This sample demonstrates iterative training '
'using a train-eval-check recursive loop. The '
'main pipeline trains the initial model and '
'then gradually trains the model some more '
'until the model evaluation metrics are good '
'enough.',
'error': None,
'id': 'b693a0d3-b11c-4c5b-b3f9-6158382948d6',
'name': '[Demo] XGBoost - Iterative model training',
'parameters': None,
'resource_references': None,
'url': None},
{'created_at': datetime.datetime(2022, 5, 22, 2, 5, 34, tzinfo=tzlocal()),
'default_version': {'code_source_url': None,
'created_at': datetime.datetime(2022, 5, 22, 2, 5, 34, tzinfo=tzlocal()),
'id': 'c65b4f2e-362d-41a8-8f5c-9b944830029e',
'name': '[Demo] TFX - Taxi tip prediction '
'model trainer',
'package_url': None,
'parameters': [{'name': 'pipeline-root',
'value': 'gs://{{kfp-default-bucket}}/tfx_taxi_simple/{{workflow.uid}}'},
{'name': 'module-file',
'value': '/opt/conda/lib/python3.7/site-packages/tfx/examples/chicago_taxi_pipeline/taxi_utils_native_keras.py'},
{'name': 'push_destination',
'value': '{"filesystem": '
'{"base_directory": '
'"gs://your-bucket/serving_model/tfx_taxi_simple"}}'}],
'resource_references': [{'key': {'id': 'c65b4f2e-362d-41a8-8f5c-9b944830029e',
'type': 'PIPELINE'},
'name': None,
'relationship': 'OWNER'}]},
'description': '[source '
'code](https://github.com/kubeflow/pipelines/tree/c8a18bde299f2fdf5f72144f15887915b8d11520/samples/core/parameterized_tfx_oss) '
'[GCP Permission '
'requirements](https://github.com/kubeflow/pipelines/blob/c8a18bde299f2fdf5f72144f15887915b8d11520/samples/core/parameterized_tfx_oss#permission). '
'Example pipeline that does classification with '
'model analysis based on a public tax cab '
'dataset.',
'error': None,
'id': 'c65b4f2e-362d-41a8-8f5c-9b944830029e',
'name': '[Demo] TFX - Taxi tip prediction model trainer',
'parameters': [{'name': 'pipeline-root',
'value': 'gs://{{kfp-default-bucket}}/tfx_taxi_simple/{{workflow.uid}}'},
{'name': 'module-file',
'value': '/opt/conda/lib/python3.7/site-packages/tfx/examples/chicago_taxi_pipeline/taxi_utils_native_keras.py'},
{'name': 'push_destination',
'value': '{"filesystem": {"base_directory": '
'"gs://your-bucket/serving_model/tfx_taxi_simple"}}'}],
'resource_references': None,
'url': None},
{'created_at': datetime.datetime(2022, 5, 22, 2, 5, 35, tzinfo=tzlocal()),
'default_version': {'code_source_url': None,
'created_at': datetime.datetime(2022, 5, 22, 2, 5, 35, tzinfo=tzlocal()),
'id': '56bb7063-ade0-4074-9721-b063f42c46fd',
'name': '[Tutorial] Data passing in python '
'components',
'package_url': None,
'parameters': None,
'resource_references': [{'key': {'id': '56bb7063-ade0-4074-9721-b063f42c46fd',
'type': 'PIPELINE'},
'name': None,
'relationship': 'OWNER'}]},
'description': '[source '
'code](https://github.com/kubeflow/pipelines/tree/c8a18bde299f2fdf5f72144f15887915b8d11520/samples/tutorials/Data%20passing%20in%20python%20components) '
'Shows how to pass data between python '
'components.',
'error': None,
'id': '56bb7063-ade0-4074-9721-b063f42c46fd',
'name': '[Tutorial] Data passing in python components',
'parameters': None,
'resource_references': None,
'url': None},
{'created_at': datetime.datetime(2022, 5, 22, 2, 5, 36, tzinfo=tzlocal()),
'default_version': {'code_source_url': None,
'created_at': datetime.datetime(2022, 5, 22, 2, 5, 36, tzinfo=tzlocal()),
'id': '36b09aa0-a317-4ad4-a0ed-ddf55a485eb0',
'name': '[Tutorial] DSL - Control '
'structures',
'package_url': None,
'parameters': None,
'resource_references': [{'key': {'id': '36b09aa0-a317-4ad4-a0ed-ddf55a485eb0',
'type': 'PIPELINE'},
'name': None,
'relationship': 'OWNER'}]},
'description': '[source '
'code](https://github.com/kubeflow/pipelines/tree/c8a18bde299f2fdf5f72144f15887915b8d11520/samples/tutorials/DSL%20-%20Control%20structures) '
'Shows how to use conditional execution and '
'exit handlers. This pipeline will randomly '
'fail to demonstrate that the exit handler gets '
'executed even in case of failure.',
'error': None,
'id': '36b09aa0-a317-4ad4-a0ed-ddf55a485eb0',
'name': '[Tutorial] DSL - Control structures',
'parameters': None,
'resource_references': None,
'url': None},
{'created_at': datetime.datetime(2022, 5, 24, 6, 46, 45, tzinfo=tzlocal()),
'default_version': {'code_source_url': None,
'created_at': datetime.datetime(2022, 5, 24, 6, 46, 45, tzinfo=tzlocal()),
'id': 'da2bc8b4-27f2-4aa3-befb-c53487d9db49',
'name': 'test',
'package_url': None,
'parameters': [{'name': 'a', 'value': '1'},
{'name': 'b', 'value': '7'}],
'resource_references': [{'key': {'id': 'da2bc8b4-27f2-4aa3-befb-c53487d9db49',
'type': 'PIPELINE'},
'name': None,
'relationship': 'OWNER'}]},
'description': 'test',
'error': None,
'id': 'da2bc8b4-27f2-4aa3-befb-c53487d9db49',
'name': 'test',
'parameters': [{'name': 'a', 'value': '1'},
{'name': 'b', 'value': '7'}],
'resource_references': None,
'url': None}],
'total_size': 5}
I read similar questions but did not find an answer.
I have this data in input:
{ "connectionHistory": [
{
"endTime": 1585571806,
"bytesSent": 31588,
"startTime": 1585571453,
"duration": 353,
"bytesReceived": 68711,
"virtualIpAddress": "10.20.1.102",
"remoteIpAddress": "172.16.15.183"
},
{
"endTime": 1585591333,
"bytesSent": 21927,
"startTime": 1585591095,
"duration": 238,
"bytesReceived": 51041,
"virtualIpAddress": "10.20.1.102",
"remoteIpAddress": "172.16.13.75"
},
{
"endTime": 1585592547,
"bytesSent": 4630423,
"startTime": 1585591333,
"duration": 1214,
"bytesReceived": 678052,
"virtualIpAddress": "10.20.1.102",
"remoteIpAddress": "172.16.13.75"
},
{
"endTime": 1585743727,
"bytesSent": 2153310,
"startTime": 1585743512,
"duration": 215,
"bytesReceived": 499382,
"virtualIpAddress": "10.20.1.102",
"remoteIpAddress": "172.16.12.209"
}
]}
And I would like to get output something like this:
Start End Duration IP client IP remote Received Sent
01 Apr 2020, 16:13 01 Apr 2020, 16:15 02m 11s 10.20.1.102 5.170.193.103 475.15 K 2.01 M
01 Apr 2020, 14:18 01 Apr 2020, 14:22 03m 35s 10.20.1.102 5.170.192.209 487.68 K 2.05 M
30 Mar 2020, 20:02 30 Mar 2020, 20:22 20m 14s 10.20.1.102 5.170.193.75 662.16 K 4.42 M
30 Mar 2020, 19:58 30 Mar 2020, 20:02 03m 58s 10.20.1.102 5.170.193.75 49.84 K 21.41 K
30 Mar 2020, 14:30 30 Mar 2020, 14:36 05m 53s 10.20.1.102 5.170.195.183 67.1 K 30.85 K
Tried to play with jq, but with no good results ...
Hints appreciated ;-)
Thanks, P.
I made some attempts and searched the net. I found code that helps (jq: Object cannot be csv-formatted, only array).
Created file json2csv.jq containing:
# Emit every path in the input that should become a column header.
# A path is emitted when the value found at it is either
#   - an array whose elements are all scalars, or
#   - a scalar whose last path component is a string (an object key rather
#     than an array index).
# All other paths produce no output.
def json2headers:
# True when the input is neither an array nor an object.
def isscalar: type | . != "array" and . != "object";
# True when every element of the input is a scalar.
def isflat: all(.[]; isscalar);
paths as $p
| getpath($p)
| if type == "array" and isflat then $p
elif isscalar and (($p[-1]|type) == "string") then $p
else empty end ;
# Build one row for the input: an array holding the value found at each path
# listed in $header, in the same order as $header.
def json2array($header):
# Value at path $p; null when getpath raises an error or the value is an object.
def value($p):
try getpath($p) catch null
| if type == "object" then null else . end;
[$header[] as $p | value($p)];
# Convert the items of the input (iterated with `.[]`) to CSV text.
# Output: first a header line, then one line per item.
#   - The header set is the sorted, de-duplicated union of the paths that
#     json2headers emits for each item; each path is rendered by joining its
#     components with "_".
#   - Each item row is built by json2array($h). Array cells are flattened to
#     a single string by joining their stringified elements with "|"; every
#     other cell is passed through tostring.
# Every row is rendered with the @csv format string. Replace @csv with @tsv
# to get tab-separated output instead.
def json2csv:
  ( [.[] | json2headers] | unique ) as $h
  | ( [ $h[] | join("_") ],
      ( .[]
        | json2array($h)
        | map( if type == "array" then map(tostring) | join("|") else tostring end ) ) )
  | @csv ;
Call it using:
jq -r -L. 'include "json2csv"; json2csv' connAAA.json
I got:
"bytesReceived","bytesSent","duration","endTime","remoteIpAddress","startTime","virtualIpAddress"
"9510","4657","81","1585511362","192.168.101.91","1585511281","10.20.1.6"
"48586","52696","1956","1585514599","192.168.101.91","1585512643","10.20.1.6"
"11829","7399","153","1585514835","192.168.101.91","1585514682","10.20.1.6"
"13871","10318","330","1585518156","192.168.101.91","1585517826","10.20.1.6"
If I use @tsv instead of @csv I get:
bytesReceived bytesSent duration endTime remoteIpAddress startTime virtualIpAddress
9510 4657 81 1585511362 192.168.101.91 1585511281 10.20.1.6
48586 52696 1956 1585514599 192.168.101.91 1585512643 10.20.1.6
11829 7399 153 1585514835 192.168.101.91 1585514682 10.20.1.6
13871 10318 330 1585518156 192.168.101.91 1585517826 10.20.1.6
That is close to my desired result.
Now (or probably earlier in the pipeline), I need to convert the Unix timestamps to date-times.
I suppose I should use the todateiso8601 function, but I can't work out where to insert it correctly.
I suppose this isn't difficult for someone skilled with jq ;-)
Thanks, P.
When I create the instance in the dashboard, I get error:
No valid host was found. There are not enough hosts available.
In the /var/log/nova/nova-conductor.log file, there is the log:
2017-08-05 00:22:29.046 3834 WARNING nova.scheduler.utils [req-89c159c7-b40a-43eb-8f0d-9306eb73e83a 2a5fa182fb1b459980db09cd1572850e 0d5998f2f7ec4c4892a32e06bafb19df - - -] Failed to compute_task_build_instances: No valid host was found. There are not enough hosts available.
Traceback (most recent call last):
File "/usr/lib/python2.7/site-packages/oslo_messaging/rpc/server.py", line 199, in inner
return func(*args, **kwargs)
File "/usr/lib/python2.7/site-packages/nova/scheduler/manager.py", line 104, in select_destinations
dests = self.driver.select_destinations(ctxt, spec_obj)
File "/usr/lib/python2.7/site-packages/nova/scheduler/filter_scheduler.py", line 74, in select_destinations
raise exception.NoValidHost(reason=reason)
NoValidHost: No valid host was found. There are not enough hosts available.
2017-08-05 00:22:29.048 3834 WARNING nova.scheduler.utils [req-89c159c7-b40a-43eb-8f0d-9306eb73e83a 2a5fa182fb1b459980db09cd1572850e 0d5998f2f7ec4c4892a32e06bafb19df - - -] [instance: 2011e343-c8fc-4ed0-8148-b0d2b5ba37c3] Setting instance to ERROR state.
2017-08-05 00:22:30.785 3834 WARNING oslo_config.cfg [req-89c159c7-b40a-43eb-8f0d-9306eb73e83a 2a5fa182fb1b459980db09cd1572850e 0d5998f2f7ec4c4892a32e06bafb19df - - -] Option "auth_plugin" from group "neutron" is deprecated. Use option "auth_type" from group "neutron".
I searched SO and found a related post: Openstack-Devstack: Can't create instance, There are not enough hosts available
I checked the free_ram_mb in mysql:
MariaDB [nova]> select * from compute_nodes \G;
*************************** 1. row ***************************
created_at: 2017-08-04 12:44:26
updated_at: 2017-08-04 13:51:35
deleted_at: NULL
id: 4
service_id: NULL
vcpus: 8
memory_mb: 7808
local_gb: 19
vcpus_used: 0
memory_mb_used: 512
local_gb_used: 0
hypervisor_type: QEMU
hypervisor_version: 1005003
cpu_info: {"vendor": "Intel", "model": "Broadwell", "arch": "x86_64", "features": ["smap", "avx", "clflush", "sep", "rtm", "vme", "invpcid", "msr", "fsgsbase", "xsave", "pge", "erms", "hle", "cmov", "tsc", "smep", "pcid", "pat", "lm", "abm", "adx", "3dnowprefetch", "nx", "fxsr", "syscall", "sse4.1", "pae", "sse4.2", "pclmuldq", "fma", "tsc-deadline", "mmx", "osxsave", "cx8", "mce", "de", "rdtscp", "ht", "pse", "lahf_lm", "rdseed", "popcnt", "mca", "pdpe1gb", "apic", "sse", "f16c", "ds", "invtsc", "pni", "aes", "avx2", "sse2", "ss", "hypervisor", "bmi1", "bmi2", "ssse3", "fpu", "cx16", "pse36", "mtrr", "movbe", "rdrand", "x2apic"], "topology": {"cores": 2, "cells": 1, "threads": 1, "sockets": 4}}
disk_available_least: 18
free_ram_mb: 7296
free_disk_gb: 19
current_workload: 0
running_vms: 0
hypervisor_hostname: ha-node1
deleted: 0
host_ip: 192.168.8.101
supported_instances: [["i686", "qemu", "hvm"], ["x86_64", "qemu", "hvm"]]
pci_stats: {"nova_object.version": "1.1", "nova_object.changes": ["objects"], "nova_object.name": "PciDevicePoolList", "nova_object.data": {"objects": []}, "nova_object.namespace": "nova"}
metrics: []
extra_resources: NULL
stats: {}
numa_topology: NULL
host: ha-node1
ram_allocation_ratio: 3
cpu_allocation_ratio: 16
uuid: 9113940b-7ec9-462d-af06-6988dbb6b6cf
disk_allocation_ratio: 1
*************************** 2. row ***************************
created_at: 2017-08-04 12:44:34
updated_at: 2017-08-04 13:50:47
deleted_at: NULL
id: 6
service_id: NULL
vcpus: 8
memory_mb: 7808
local_gb: 19
vcpus_used: 0
memory_mb_used: 512
local_gb_used: 0
hypervisor_type: QEMU
hypervisor_version: 1005003
cpu_info: {"vendor": "Intel", "model": "Broadwell", "arch": "x86_64", "features": ["smap", "avx", "clflush", "sep", "rtm", "vme", "invpcid", "msr", "fsgsbase", "xsave", "pge", "erms", "hle", "cmov", "tsc", "smep", "pcid", "pat", "lm", "abm", "adx", "3dnowprefetch", "nx", "fxsr", "syscall", "sse4.1", "pae", "sse4.2", "pclmuldq", "fma", "tsc-deadline", "mmx", "osxsave", "cx8", "mce", "de", "rdtscp", "ht", "pse", "lahf_lm", "rdseed", "popcnt", "mca", "pdpe1gb", "apic", "sse", "f16c", "ds", "invtsc", "pni", "aes", "avx2", "sse2", "ss", "hypervisor", "bmi1", "bmi2", "ssse3", "fpu", "cx16", "pse36", "mtrr", "movbe", "rdrand", "x2apic"], "topology": {"cores": 2, "cells": 1, "threads": 1, "sockets": 4}}
disk_available_least: 18
free_ram_mb: 7296
free_disk_gb: 19
current_workload: 0
running_vms: 0
hypervisor_hostname: ha-node2
deleted: 0
host_ip: 192.168.8.102
supported_instances: [["i686", "qemu", "hvm"], ["x86_64", "qemu", "hvm"]]
pci_stats: {"nova_object.version": "1.1", "nova_object.changes": ["objects"], "nova_object.name": "PciDevicePoolList", "nova_object.data": {"objects": []}, "nova_object.namespace": "nova"}
metrics: []
extra_resources: NULL
stats: {}
numa_topology: NULL
host: ha-node2
ram_allocation_ratio: 3
cpu_allocation_ratio: 16
uuid: 32b574df-52ac-43dc-87f8-353350449076
disk_allocation_ratio: 1
2 rows in set (0.00 sec)
As you can see, free_ram_mb is 7296. I just want to create a 512 MB VM, but it fails.
EDIT-1
The nova services all are up:
[root@ha-node1 ~]# nova service-list
+----+------------------+----------+----------+---------+-------+----------------------------+-----------------+
| Id | Binary | Host | Zone | Status | State | Updated_at | Disabled Reason |
+----+------------------+----------+----------+---------+-------+----------------------------+-----------------+
| 2 | nova-consoleauth | ha-node3 | internal | enabled | up | 2017-08-05T14:20:25.000000 | - |
| 5 | nova-conductor | ha-node3 | internal | enabled | up | 2017-08-05T14:20:29.000000 | - |
| 7 | nova-cert | ha-node3 | internal | enabled | up | 2017-08-05T14:20:23.000000 | - |
| 15 | nova-scheduler | ha-node3 | internal | enabled | up | 2017-08-05T14:20:20.000000 | - |
| 22 | nova-cert | ha-node1 | internal | enabled | up | 2017-08-05T14:20:26.000000 | - |
| 29 | nova-conductor | ha-node1 | internal | enabled | up | 2017-08-05T14:20:22.000000 | - |
| 32 | nova-consoleauth | ha-node1 | internal | enabled | up | 2017-08-05T14:20:29.000000 | - |
| 33 | nova-consoleauth | ha-node2 | internal | enabled | up | 2017-08-05T14:20:29.000000 | - |
| 36 | nova-scheduler | ha-node1 | internal | enabled | up | 2017-08-05T14:20:30.000000 | - |
| 40 | nova-conductor | ha-node2 | internal | enabled | up | 2017-08-05T14:20:26.000000 | - |
| 44 | nova-cert | ha-node2 | internal | enabled | up | 2017-08-05T14:20:27.000000 | - |
| 46 | nova-scheduler | ha-node2 | internal | enabled | up | 2017-08-05T14:20:28.000000 | - |
| 49 | nova-compute | ha-node2 | nova | enabled | up | 2017-08-05T14:19:35.000000 | - |
| 53 | nova-compute | ha-node1 | nova | enabled | up | 2017-08-05T14:20:05.000000 | - |
+----+------------------+----------+----------+---------+-------+----------------------------+-----------------+
The nova list:
[root@ha-node1 ~]# nova list
+--------------------------------------+------+--------+------------+-------------+----------+
| ID | Name | Status | Task State | Power State | Networks |
+--------------------------------------+------+--------+------------+-------------+----------+
| 20193e58-2c5b-44c6-a98f-a44e2001934f | vm1 | ERROR | - | NOSTATE | |
And the nova show instance:
[root@ha-node1 ~]# nova show 20193e58-2c5b-44c6-a98f-a44e2001934f
+--------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Property | Value |
+--------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| OS-DCF:diskConfig | AUTO |
| OS-EXT-AZ:availability_zone | nova |
| OS-EXT-SRV-ATTR:host | - |
| OS-EXT-SRV-ATTR:hostname | vm1 |
| OS-EXT-SRV-ATTR:hypervisor_hostname | - |
| OS-EXT-SRV-ATTR:instance_name | instance-00000003 |
| OS-EXT-SRV-ATTR:kernel_id | |
| OS-EXT-SRV-ATTR:launch_index | 0 |
| OS-EXT-SRV-ATTR:ramdisk_id | |
| OS-EXT-SRV-ATTR:reservation_id | r-jct8kkcq |
| OS-EXT-SRV-ATTR:root_device_name | /dev/vda |
| OS-EXT-SRV-ATTR:user_data | - |
| OS-EXT-STS:power_state | 0 |
| OS-EXT-STS:task_state | - |
| OS-EXT-STS:vm_state | error |
| OS-SRV-USG:launched_at | - |
| OS-SRV-USG:terminated_at | - |
| accessIPv4 | |
| accessIPv6 | |
| config_drive | |
| created | 2017-08-05T14:17:54Z |
| description | vm1 |
| fault | {"message": "No valid host was found. There are not enough hosts available.", "code": 500, "details": " File \"/usr/lib/python2.7/site-packages/nova/conductor/manager.py\", line 496, in build_instances |
| | context, request_spec, filter_properties) |
| | File \"/usr/lib/python2.7/site-packages/nova/conductor/manager.py\", line 567, in _schedule_instances |
| | hosts = self.scheduler_client.select_destinations(context, spec_obj) |
| | File \"/usr/lib/python2.7/site-packages/nova/scheduler/utils.py\", line 370, in wrapped |
| | return func(*args, **kwargs) |
| | File \"/usr/lib/python2.7/site-packages/nova/scheduler/client/__init__.py\", line 51, in select_destinations |
| | return self.queryclient.select_destinations(context, spec_obj) |
| | File \"/usr/lib/python2.7/site-packages/nova/scheduler/client/__init__.py\", line 37, in __run_method |
| | return getattr(self.instance, __name)(*args, **kwargs) |
| | File \"/usr/lib/python2.7/site-packages/nova/scheduler/client/query.py\", line 32, in select_destinations |
| | return self.scheduler_rpcapi.select_destinations(context, spec_obj) |
| | File \"/usr/lib/python2.7/site-packages/nova/scheduler/rpcapi.py\", line 126, in select_destinations |
| | return cctxt.call(ctxt, 'select_destinations', **msg_args) |
| | File \"/usr/lib/python2.7/site-packages/oslo_messaging/rpc/client.py\", line 169, in call |
| | retry=self.retry) |
| | File \"/usr/lib/python2.7/site-packages/oslo_messaging/transport.py\", line 97, in _send |
| | timeout=timeout, retry=retry) |
| | File \"/usr/lib/python2.7/site-packages/oslo_messaging/_drivers/amqpdriver.py\", line 464, in send |
| | retry=retry) |
| | File \"/usr/lib/python2.7/site-packages/oslo_messaging/_drivers/amqpdriver.py\", line 455, in _send |
| | raise result |
| | ", "created": "2017-08-05T14:18:14Z"} |
| flavor | m1.tiny (1) |
| hostId | |
| host_status | |
| id | 20193e58-2c5b-44c6-a98f-a44e2001934f |
| image | cirros-0.3.4-x86_64 (202778cd-6b32-4486-9444-c167089d9082) |
| key_name | - |
| locked | False |
| metadata | {} |
| name | vm1 |
| os-extended-volumes:volumes_attached | [] |
| status | ERROR |
| tags | [] |
| tenant_id | 0d5998f2f7ec4c4892a32e06bafb19df |
| updated | 2017-08-05T14:18:16Z |
| user_id | 2a5fa182fb1b459980db09cd1572850e |
+--------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
EDIT-2
The nova-compute.log in the /var/log/nova/ useful information:
......
2017-08-05 22:17:42.669 103174 INFO nova.compute.resource_tracker [req-60a062ce-4b3d-4cb7-863e-2f9bba0bc6ec - - - - -] Compute_service record updated for ha-node1:ha-node1
2017-08-05 22:18:03.357 103174 ERROR nova.compute.manager [req-7dded91f-7497-4d20-ba89-69f867a2a8fb 2a5fa182fb1b459980db09cd1572850e 0d5998f2f7ec4c4892a32e06bafb19df - - -] [instance: 20193e58-2c5b-44c6-a98f-a44e2001934f] Instance failed to spawn
2017-08-05 22:18:03.357 103174 ERROR nova.compute.manager [instance: 20193e58-2c5b-44c6-a98f-a44e2001934f] Traceback (most recent call last):
2017-08-05 22:18:03.357 103174 ERROR nova.compute.manager [instance: 20193e58-2c5b-44c6-a98f-a44e2001934f] File "/usr/lib/python2.7/site-packages/nova/compute/manager.py", line 2078, in _build_resources
2017-08-05 22:18:03.357 103174 ERROR nova.compute.manager [instance: 20193e58-2c5b-44c6-a98f-a44e2001934f] yield resources
2017-08-05 22:18:03.357 103174 ERROR nova.compute.manager [instance: 20193e58-2c5b-44c6-a98f-a44e2001934f] File "/usr/lib/python2.7/site-packages/nova/compute/manager.py", line 1920, in _build_and_run_instance
2017-08-05 22:18:03.357 103174 ERROR nova.compute.manager [instance: 20193e58-2c5b-44c6-a98f-a44e2001934f] block_device_info=block_device_info)
2017-08-05 22:18:03.357 103174 ERROR nova.compute.manager [instance: 20193e58-2c5b-44c6-a98f-a44e2001934f] File "/usr/lib/python2.7/site-packages/nova/virt/libvirt/driver.py", line 2584, in spawn
2017-08-05 22:18:03.357 103174 ERROR nova.compute.manager [instance: 20193e58-2c5b-44c6-a98f-a44e2001934f] admin_pass=admin_password)
2017-08-05 22:18:03.357 103174 ERROR nova.compute.manager [instance: 20193e58-2c5b-44c6-a98f-a44e2001934f] File "/usr/lib/python2.7/site-packages/nova/virt/libvirt/driver.py", line 2959, in _create_image
2017-08-05 22:18:03.357 103174 ERROR nova.compute.manager [instance: 20193e58-2c5b-44c6-a98f-a44e2001934f] fileutils.ensure_tree(libvirt_utils.get_instance_path(instance))
2017-08-05 22:18:03.357 103174 ERROR nova.compute.manager [instance: 20193e58-2c5b-44c6-a98f-a44e2001934f] File "/usr/lib/python2.7/site-packages/oslo_utils/fileutils.py", line 40, in ensure_tree
2017-08-05 22:18:03.357 103174 ERROR nova.compute.manager [instance: 20193e58-2c5b-44c6-a98f-a44e2001934f] os.makedirs(path, mode)
2017-08-05 22:18:03.357 103174 ERROR nova.compute.manager [instance: 20193e58-2c5b-44c6-a98f-a44e2001934f] File "/usr/lib64/python2.7/os.py", line 157, in makedirs
2017-08-05 22:18:03.357 103174 ERROR nova.compute.manager [instance: 20193e58-2c5b-44c6-a98f-a44e2001934f] mkdir(name, mode)
2017-08-05 22:18:03.357 103174 ERROR nova.compute.manager [instance: 20193e58-2c5b-44c6-a98f-a44e2001934f] OSError: [Errno 13] Permission denied: '/var/lib/nova/instances/20193e58-2c5b-44c6-a98f-a44e2001934f'
2017-08-05 22:18:03.357 103174 ERROR nova.compute.manager [instance: 20193e58-2c5b-44c6-a98f-a44e2001934f]
2017-08-05 22:18:11.563 103174 INFO nova.compute.manager [req-7dded91f-7497-4d20-ba89-69f867a2a8fb 2a5fa182fb1b459980db09cd1572850e 0d5998f2f7ec4c4892a32e06bafb19df - - -] [instance: 20193e58-2c5b-44c6-a98f-a44e2001934f] Terminating instance
....
Debugging
Enable debug mode to get detailed logs.
Set debug = True in these files:
/etc/nova/nova.conf
/etc/cinder/cinder.conf
/etc/glance/glance-registry.conf
Restart the reconfigured services
Try to create instance again and check logs.
Take a look at the nova-scheduler.log file and try to find a line like this:
.. INFO nova.filters [req-..] Filter DiskFilter returned 0 hosts
Above this line there should be DEBUG logs with detailed information about each filter, for example:
.. DEBUG nova.filters [req-..] Filter RetryFilter returned 1 host(s) get_filtered_objects /usr/lib/python2.7/site-packages/nova/filters.py:104
.. DEBUG nova.filters [req-..] Filter AvailabilityZoneFilter returned 1 host(s) get_filtered_objects /usr/lib/python2.7/site-packages/nova/filters.py:104
.. DEBUG nova.filters [req-..] Filter RamFilter returned 1 host(s) get_filtered_objects /usr/lib/python2.7/site-packages/nova/filters.py:104
.. DEBUG nova.filters [req-..] (...) ram: 37107MB disk: 11264MB io_ops: 0 instances: 4 does not have 17408 MB usable disk, it only has 11264.0 MB usable disk. host_passes /usr/lib/python2.7/site-packages/nova/scheduler/filters/disk_filter.py:70
Overcommitting
OpenStack allows you to overcommit CPU and RAM on compute nodes. This allows you to increase the number of instances running on your cloud at the cost of reducing the performance of the instances. The Compute service uses the following ratios by default:
CPU allocation ratio: 16:1
RAM allocation ratio: 1.5:1
Please read the documentation for more information.
You can change the allocation ratios using these options in nova.conf:
cpu_allocation_ratio
ram_allocation_ratio
disk_allocation_ratio
First you need to check the output of "nova service-list" or "openstack compute service list". It should show at least one 'nova-compute' service with state as "Up" and status as 'enabled'.
If the above is fine, then the compute nodes are communicating properly with the scheduler. If not, then you need to check the nova-scheduler logs.
The nova-scheduler has a series of filters (such as the Memory filter, CPU filter, and Aggregate filter) which it applies to filter hosts based on the flavor you selected. For example, if you select a flavor with 16GB RAM, then the scheduler will filter (Memory filter) for compute hosts which have that much memory available. After all the filtering is done, the scheduler will try to launch the instance on a filtered host; if that fails, it will try on another host. The default number of tries is 3. All of this can be seen in the scheduler logs, which will give you a clear idea of what went wrong.
You also need to check the output of 'nova show <instance-id>'. If you can see the compute host present in "OS-EXT-SRV-ATTR:hypervisor_hostname", then the scheduler was able to allocate a compute host successfully and something went wrong on the compute host itself. In that case, you need to check the nova-compute logs of that hypervisor.
Finally, I found that I had mounted /var/lib/nova/ on the NFS directory /mnt/sdb/var/lib/nova/, but the ownership of /mnt/sdb/var/lib/nova/ was root:root, so I changed it to nova:nova (the same as /var/lib/nova/).
command :
chown -R nova:nova nova
Edit /etc/nova/nova.conf on all the compute nodes and modify it as per your application requirements:
cpu_allocation_ratio = 2.0
(instances in total can use up to double the number of physical cores)
ram_allocation_ratio = 2.0
(instances in total can use up to double the total physical memory)
Restart nova and nova-scheduler on all the compute nodes:
systemctl restart openstack-nova-*
systemctl restart openstack-nova-scheduler.service