# Advertisement
# Not a member of Pastebin yet?
# Sign Up, it unlocks many cool features!
import argparse
import functools
import os
import sys
import time

import requests
from tqdm import tqdm
# Command-line interface: lets the GPU-count range be overridden per run.
parser = argparse.ArgumentParser()
for flag, default, help_text in (
    ("--min-gpu-count", 6, "Minimum number of GPUs to rent"),
    ("--max-gpu-count", 8, "Maximum number of GPUs to rent"),
):
    parser.add_argument(flag, type=int, default=default, help=help_text)
# NOTE(review): the enormous recursion limit exists only because create_pod()
# retries by recursing on every availability failure; an iterative retry loop
# removes the need for it. Kept so module-level behavior is unchanged.
sys.setrecursionlimit(100_000_000)

p = print  # shorthand alias used throughout this script

# --- Pod configuration constants -------------------------------------------
WAIT_TIME = 1
NAME = 'icons-train'
MIN_GPU_COUNT = 6   # default lower bound of GPUs to request
MAX_GPU_COUNT = 8   # default upper bound of GPUs to request
# Fix: this was an f-string with no placeholders; plain literal is identical.
IMAGE_NAME = 'runpod/pytorch:2.1.0-py3.10-cuda11.8.0-devel-ubuntu22.04'
GPU_TYPE_ID = 'NVIDIA A100 80GB PCIe'
OS_DISK_SIZE_GB = 100
PERSISTENT_DISK_SIZE_GB = 0       # 0 = no persistent volume (network volume is used)
CLOUD_TYPE = 'SECURE'
MIN_DOWNLOAD_SPEED = 700          # Mbps, passed as minDownload to the API
DATA_CENTER_ID = 'EU-RO-1'
NETWORK_VOLUME_ID = '54epb6rtc4'
TEMPLATE_ID = 'ue50iblx66'
CUDA_VERSION = '12.2'
class CurrentDatetime:
    """Human-readable 'now' timestamp, e.g. ``01 Jan 2024 12:34:56``.

    Fix: ``date`` and ``time`` were class attributes, so they were evaluated
    exactly once at import time and every log line printed the script's start
    time instead of the actual current time. They are now properties that
    re-read the clock on each access; ``str(CurrentDatetime())`` is unchanged
    in format.
    """

    @property
    def date(self):
        # Current date, formatted like the original class attribute.
        return time.strftime('%d %b %Y')

    @property
    def time(self):
        # Current wall-clock time (the name shadows the module only on instances).
        return time.strftime('%H:%M:%S')

    def __str__(self):
        return f"{self.date} {self.time}"
def get_caught_error(additional_message=""):
    """Format the currently handled exception into a readable report.

    Must be called from inside an ``except`` block (relies on sys.exc_info()).
    Returns a multi-line string with timestamp, source file, exception type,
    line number, message, and the optional *additional_message*.
    """
    exc_type, exc_value, trace = sys.exc_info()
    source_file = os.path.split(trace.tb_frame.f_code.co_filename)[1]
    pieces = [
        f"Now {CurrentDatetime()}\n",
        f"\"{source_file}\" caught an error ",
        f"{exc_type.__name__} ",
        f"on line {trace.tb_lineno}:\n",
        f"\"{exc_value}\"",
        f"\n\n{additional_message}",
    ]
    return "".join(pieces)
def retry_request_till_success(request) -> requests.models.Response:
    """Decorator: re-issue *request* until it yields a 200 or 404 response.

    404 is treated as a definitive answer (the caller inspects it), everything
    else is retried. Timeouts and dropped connections are caught and retried
    forever.

    Fixes over the original:
      * The inner loop re-checked the status of the *same* response object
        without ever re-issuing the request, so any non-200/404 status caused
        an infinite sleep/check loop. The request is now actually re-sent.
      * ``== None`` -> ``is None``; dead ``retry_time_count`` removed.
      * The parameter annotation claimed ``requests.models.Request`` but the
        decorator receives a callable; annotation dropped.
      * ``functools.wraps`` preserves the wrapped function's metadata.

    Returns None when the wrapped callable itself returns None.  # doc.id#13
    """
    @functools.wraps(request)
    def wrapper(*args, **kwargs):
        retry_in_seconds = 1
        while True:
            try:
                response = request(*args, **kwargs)
                if response is None:  # doc.id#13
                    p("There is no request to retry")
                    return None
                if response.status_code in (200, 404):
                    return response
                p(
                    f"URL: {response.url} "
                    f"Request in {request.__name__} failed. "
                    f"Status_code – {response.status_code} "
                    f"Will retry again in {retry_in_seconds} seconds"
                )
                time.sleep(retry_in_seconds)
            except (
                requests.ConnectTimeout,
                requests.exceptions.ReadTimeout,
            ):
                error = get_caught_error()
                p(
                    f"Request in {request.__name__} failed. "
                    f"Timed out with error {error} "  # fix: separator was missing
                    f"Will retry again in {retry_in_seconds} seconds"
                )
                time.sleep(retry_in_seconds)
            except requests.exceptions.ConnectionError:
                sleep_time = 0.1
                p(
                    f"Request in {request.__name__} failed @"
                    f"{CurrentDatetime()}. "
                    "No connection to internet... "
                    f"Will retry again in {sleep_time} seconds"
                )
                time.sleep(sleep_time)
    return wrapper
@retry_request_till_success
def request_with_retries(url, json):
    """POST *json* to *url*; the decorator keeps retrying until 200/404."""
    response = requests.post(url, json=json)
    return response
class API(object):
    """Thin client for the RunPod GraphQL endpoint."""

    def __init__(self, api_key):
        # RunPod API key; appended to the query URL for authenticated calls.
        self.API_KEY = api_key

    def _run_query(self, payload, auth_required=False):
        # Send *payload* (a {"query": ...} dict) to the GraphQL endpoint,
        # retrying until a 200/404 response via request_with_retries.
        # NOTE(review): the key is passed as a URL query parameter, where it
        # may leak into logs/proxies — an Authorization header would be safer.
        url = 'https://api.runpod.io/graphql'
        if auth_required:
            url += f'?api_key={self.API_KEY}'
        response = request_with_retries(url, payload)
        return response

    # https://docs.runpod.io/docs/create-pod
    def create_on_demand_pod(self, pod_config):
        # Deploy an on-demand pod. *pod_config* is a pre-rendered string of
        # GraphQL input fields spliced into the mutation via .format(); the
        # doubled braces {{ }} are literal braces that survive formatting.
        return self._run_query({
            "query": """
            mutation {{
              podFindAndDeployOnDemand(input: {{ {pod_config} }}) {{
                containerDiskInGb
                apiKey
                costPerHr
                desiredStatus
                dockerArgs
                dockerId
                gpuCount
                id
                imageName
                machineId
                memoryInGb
                name
                podType
                ports
                templateId
                uptimeSeconds
                vcpuCount
                version
                volumeEncrypted
                volumeInGb
                volumeKey
                volumeMountPath
                runtime {{
                  uptimeInSeconds
                  ports {{
                    ip
                    isIpPublic
                    privatePort
                    publicPort
                    type
                  }}
                  gpus {{
                    id
                    gpuUtilPercent
                    memoryUtilPercent
                  }}
                  container {{
                    cpuPercent
                    memoryPercent
                  }}
                }}
                machine {{
                  podHostId
                }}
              }}
            }}
            """.format(pod_config=pod_config)
        }, True)
# Exact error strings returned by the RunPod API. create_pod() compares
# error['message'] against these verbatim to decide whether a failure is a
# retryable availability problem — do not reword them.
ERRORS = {
    "specs": (
        "There are no longer any instances available with "
        "the requested specifications. Please refresh and try again."
    ),
    "disk": (
        "There are no longer any instances available with "
        "enough disk space."
    )
}
def calculate_gpu(min_gpu_count, max_gpu_count):
    """Endlessly yield GPU counts from *max_gpu_count* down to *min_gpu_count*.

    The descending order means the most desirable (largest) pod is always
    attempted first on each pass.
    """
    descending_counts = range(max_gpu_count, min_gpu_count - 1, -1)
    while True:
        yield from descending_counts
def create_pod(api, bar, gpu_count_generator):
    """Keep trying to deploy an on-demand pod until one is actually rented.

    Each attempt pulls the next GPU count from *gpu_count_generator* (which
    cycles max→min forever) and submits a podFindAndDeployOnDemand mutation.

    Fixes over the original:
      * Retried by unbounded recursion (hence the module's huge recursion
        limit) AND discarded the recursive call's return value, so a retry
        that eventually succeeded still returned None. Now an iterative loop
        that returns the successful response.
      * The "sleeping for 5 seconds" message was printed but no sleep ever
        happened; the promised time.sleep(5) is now performed.

    Returns the parsed JSON response on success, or None on a fatal
    (non-retryable) error.
    """
    while True:
        gpu_count = next(gpu_count_generator)
        bar.set_description(f"{gpu_count} GPU")
        bar.update(1)
        pod_config = f"""
            cudaVersion: "{CUDA_VERSION}",
            templateId: "{TEMPLATE_ID}",
            networkVolumeId: "{NETWORK_VOLUME_ID}",
            dataCenterId: "{DATA_CENTER_ID}",
            minDownload: {MIN_DOWNLOAD_SPEED},
            gpuCount: {gpu_count},
            volumeInGb: {PERSISTENT_DISK_SIZE_GB},
            containerDiskInGb: {OS_DISK_SIZE_GB},
            gpuTypeId: "{GPU_TYPE_ID}",
            cloudType: {CLOUD_TYPE},
            supportPublicIp: true,
            name: "{NAME}",
            dockerArgs: "",
            volumeMountPath: "/workspace",
            imageName: "{IMAGE_NAME}",
            startJupyter: true,
            startSsh: true,
        """
        response = api.create_on_demand_pod(pod_config)
        resp_json = response.json()
        if response.status_code != 200:
            return None
        if 'errors' not in resp_json:
            return resp_json
        should_retry = False
        for error in resp_json['errors']:
            if error['message'] == ERRORS['specs']:
                # Availability race: just try again with the next GPU count.
                should_retry = True
            elif error['message'] == ERRORS['disk']:
                print(error)
                print('No instances with enough disk space available, sleeping for 5 seconds')
                time.sleep(5)
                should_retry = True
            else:
                print('ERROR: ' + error['message'])
        if not should_retry:
            return None
def get_pod_and_sound_alarm(api, min_gpu_count=MIN_GPU_COUNT, max_gpu_count=MAX_GPU_COUNT):
    """Rent a pod (retrying until one is secured) and play an audible alarm.

    Args:
        api: API client used to submit the deploy mutation.
        min_gpu_count: smallest acceptable GPU count (inclusive).
        max_gpu_count: largest desired GPU count (inclusive).
    """
    # Fix: the progress-bar label was hard-coded to "8 GPU"; it now reflects
    # the actual starting GPU count. create_pod updates it on every attempt.
    bar = tqdm(desc=f"{max_gpu_count} GPU", total=1)
    gpu_count_generator = calculate_gpu(min_gpu_count, max_gpu_count)
    create_pod(api, bar, gpu_count_generator)
    # NOTE(review): 'afplay' is macOS-only; no alarm sounds on Linux/Windows.
    os.system('afplay alarm.mov')
if __name__ == '__main__':
    args = parser.parse_args()
    # Fix: the API key was a hard-coded placeholder ("XXX"). Prefer the
    # RUNPOD_API_KEY environment variable, falling back to the old literal
    # so existing behavior is preserved when the variable is unset.
    api = API(os.environ.get('RUNPOD_API_KEY', 'XXX'))
    get_pod_and_sound_alarm(api, args.min_gpu_count, args.max_gpu_count)
# Advertisement
# Add Comment
# Please, Sign In to add comment
# Advertisement