# Untitled

# Seconds to sleep between pod-creation attempts (passed to time.sleep).
WAIT_TIME = 1
# Value sent as the pod's `name` field.
NAME = "icons-train"

# Container image requested for the pod.
# NOTE(review): the image tag says cuda11.8.0 while CUDA_VERSION below is 12.2
# — confirm this combination is intended.
IMAGE_NAME = "runpod/pytorch:2.1.0-py3.10-cuda11.8.0-devel-ubuntu22.04"
GPU_TYPE_ID = "NVIDIA A100 80GB PCIe"
# Sent as `containerDiskInGb` and `volumeInGb` respectively.
OS_DISK_SIZE_GB = 100
PERSISTENT_DISK_SIZE_GB = 0
# Interpolated unquoted into the request, i.e. as a bare token rather than a string.
CLOUD_TYPE = "SECURE"
# Sent as `minDownload`; unit is not stated here (presumably Mbps — confirm).
MIN_DOWNLOAD_SPEED = 700
DATA_CENTER_ID = "EU-RO-1"
NETWORK_VOLUME_ID = "54epb6rtc4"
TEMPLATE_ID = "ue50iblx66"
CUDA_VERSION = "12.2"
GPU_COUNT = 8

# Exact error messages that create_pod compares against to decide whether an
# attempt should be retried. Matching is by string equality, so the text must
# stay character-for-character identical to what the API returns.
ERRORS = dict(
    specs=(
        "There are no longer any instances available "
        "with the requested specifications. "
        "Please refresh and try again."
    ),
    disk=(
        "There are no longer any instances available "
        "with enough disk space."
    ),
)


def create_pod(api, bar):
    """Request an on-demand pod, retrying while no capacity is available.

    The request is built from the module-level configuration constants and
    sent through ``api.create_on_demand_pod``. When the response lists one of
    the known "no capacity" errors (``ERRORS['specs']`` or ``ERRORS['disk']``),
    the function sleeps ``WAIT_TIME`` seconds and tries again. Retrying is
    done with a loop rather than recursion so that a long wait for capacity
    cannot hit the interpreter's recursion limit, and so that a single
    response triggers at most one retry.

    Args:
        api: Object exposing ``create_on_demand_pod(config)``, which returns a
            response with ``status_code`` and ``json()``.
        bar: Progress object exposing ``update(n)``; advanced by one per
            attempt.

    Returns:
        The decoded JSON response of the first attempt that contains no
        ``errors`` key, or ``None`` when the request fails with a non-200
        status or only reports errors that are not known to be retryable.
    """
    # The configuration does not change between attempts, so build it once.
    pod_config = f"""
        cudaVersion: "{CUDA_VERSION}",
        templateId: "{TEMPLATE_ID}",
        networkVolumeId: "{NETWORK_VOLUME_ID}",
        dataCenterId: "{DATA_CENTER_ID}",
        minDownload: {MIN_DOWNLOAD_SPEED},
        gpuCount: {GPU_COUNT},
        volumeInGb: {PERSISTENT_DISK_SIZE_GB},
        containerDiskInGb: {OS_DISK_SIZE_GB},
        gpuTypeId: "{GPU_TYPE_ID}",
        cloudType: {CLOUD_TYPE},
        supportPublicIp: true,
        name: "{NAME}",
        dockerArgs: "",
        volumeMountPath: "/workspace",
        imageName: "{IMAGE_NAME}",
        startJupyter: true,
        startSsh: true,
    """

    while True:
        bar.update(1)
        response = api.create_on_demand_pod(pod_config)

        # Check the status before decoding: an error page may not be JSON.
        if response.status_code != 200:
            print(f'ERROR: request failed with HTTP status {response.status_code}')
            return None

        resp_json = response.json()
        if 'errors' not in resp_json:
            return resp_json

        should_retry = False
        for error in resp_json['errors']:
            message = error['message']
            if message == ERRORS['specs']:
                should_retry = True
            elif message == ERRORS['disk']:
                print(error)
                print(
                    'No instances with enough disk space available, '
                    f'sleeping for {WAIT_TIME} seconds'
                )
                should_retry = True
            else:
                print('ERROR: ' + message)

        if not should_retry:
            return None

        # One pause and one new attempt per response, however many retryable
        # errors that response listed.
        time.sleep(WAIT_TIME)


# Progress indicator handed to create_pod, which calls bar.update(1) on every
# attempt. NOTE(review): no import of tqdm is visible in this file — confirm
# it is provided elsewhere.
bar = tqdm()

# NOTE(review): "XXX" looks like a placeholder for the real API key, and API
# is not defined or imported in the visible file — confirm both.
api = API("XXX")
res = create_pod(api, bar)
# Bare expression; presumably left over from a notebook cell, where it would
# display the value. It has no effect when the file runs as a script.
res