# Advertisement
# Not a member of Pastebin yet?
# Sign Up, it unlocks many cool features!
import argparse
import functools
import os
import sys
import time

import requests
from tqdm import tqdm
# Command-line interface: lets the GPU-count range be overridden per run.
parser = argparse.ArgumentParser()
for flag, default, help_text in (
    ("--min-gpu-count", 6, "Minimum number of GPUs to rent"),
    ("--max-gpu-count", 8, "Maximum number of GPUs to rent"),
):
    parser.add_argument(flag, type=int, default=default, help=help_text)
# NOTE(review): the enormous recursion limit exists only because create_pod()
# retries by recursing on every availability failure; an iterative retry loop
# removes the need for it. Kept so module-level behavior is unchanged.
sys.setrecursionlimit(100_000_000)

p = print  # shorthand alias used throughout this script

# --- Pod configuration constants -------------------------------------------
WAIT_TIME = 1
NAME = 'icons-train'
MIN_GPU_COUNT = 6   # default lower bound of GPUs to request
MAX_GPU_COUNT = 8   # default upper bound of GPUs to request
# Fix: this was an f-string with no placeholders; plain literal is identical.
IMAGE_NAME = 'runpod/pytorch:2.1.0-py3.10-cuda11.8.0-devel-ubuntu22.04'
GPU_TYPE_ID = 'NVIDIA A100 80GB PCIe'
OS_DISK_SIZE_GB = 100
PERSISTENT_DISK_SIZE_GB = 0       # 0 = no persistent volume (network volume is used)
CLOUD_TYPE = 'SECURE'
MIN_DOWNLOAD_SPEED = 700          # Mbps, passed as minDownload to the API
DATA_CENTER_ID = 'EU-RO-1'
NETWORK_VOLUME_ID = '54epb6rtc4'
TEMPLATE_ID = 'ue50iblx66'
CUDA_VERSION = '12.2'
class CurrentDatetime:
    """Human-readable 'now' timestamp, e.g. ``01 Jan 2024 12:34:56``.

    Fix: ``date`` and ``time`` were class attributes, so they were evaluated
    exactly once at import time and every log line printed the script's start
    time instead of the actual current time. They are now properties that
    re-read the clock on each access; ``str(CurrentDatetime())`` is unchanged
    in format.
    """

    @property
    def date(self):
        # Current date, formatted like the original class attribute.
        return time.strftime('%d %b %Y')

    @property
    def time(self):
        # Current wall-clock time (the name shadows the module only on instances).
        return time.strftime('%H:%M:%S')

    def __str__(self):
        return f"{self.date} {self.time}"
def get_caught_error(additional_message=""):
    """Format the currently handled exception into a readable report.

    Must be called from inside an ``except`` block (relies on sys.exc_info()).
    Returns a multi-line string with timestamp, source file, exception type,
    line number, message, and the optional *additional_message*.
    """
    exc_type, exc_value, trace = sys.exc_info()
    source_file = os.path.split(trace.tb_frame.f_code.co_filename)[1]
    pieces = [
        f"Now {CurrentDatetime()}\n",
        f"\"{source_file}\" caught an error ",
        f"{exc_type.__name__} ",
        f"on line {trace.tb_lineno}:\n",
        f"\"{exc_value}\"",
        f"\n\n{additional_message}",
    ]
    return "".join(pieces)
def retry_request_till_success(request) -> requests.models.Response:
    """Decorator: re-issue *request* until it yields a 200 or 404 response.

    404 is treated as a definitive answer (the caller inspects it), everything
    else is retried. Timeouts and dropped connections are caught and retried
    forever.

    Fixes over the original:
      * The inner loop re-checked the status of the *same* response object
        without ever re-issuing the request, so any non-200/404 status caused
        an infinite sleep/check loop. The request is now actually re-sent.
      * ``== None`` -> ``is None``; dead ``retry_time_count`` removed.
      * The parameter annotation claimed ``requests.models.Request`` but the
        decorator receives a callable; annotation dropped.
      * ``functools.wraps`` preserves the wrapped function's metadata.

    Returns None when the wrapped callable itself returns None.  # doc.id#13
    """
    @functools.wraps(request)
    def wrapper(*args, **kwargs):
        retry_in_seconds = 1
        while True:
            try:
                response = request(*args, **kwargs)
                if response is None:  # doc.id#13
                    p("There is no request to retry")
                    return None
                if response.status_code in (200, 404):
                    return response
                p(
                    f"URL: {response.url} "
                    f"Request in {request.__name__} failed. "
                    f"Status_code – {response.status_code} "
                    f"Will retry again in {retry_in_seconds} seconds"
                )
                time.sleep(retry_in_seconds)
            except (
                requests.ConnectTimeout,
                requests.exceptions.ReadTimeout,
            ):
                error = get_caught_error()
                p(
                    f"Request in {request.__name__} failed. "
                    f"Timed out with error {error} "  # fix: separator was missing
                    f"Will retry again in {retry_in_seconds} seconds"
                )
                time.sleep(retry_in_seconds)
            except requests.exceptions.ConnectionError:
                sleep_time = 0.1
                p(
                    f"Request in {request.__name__} failed @"
                    f"{CurrentDatetime()}. "
                    "No connection to internet... "
                    f"Will retry again in {sleep_time} seconds"
                )
                time.sleep(sleep_time)
    return wrapper
@retry_request_till_success
def request_with_retries(url, json):
    """POST *json* to *url*; the decorator keeps retrying until 200/404."""
    response = requests.post(url, json=json)
    return response
class API(object):
    """Thin client for the RunPod GraphQL endpoint."""

    def __init__(self, api_key):
        # RunPod API key; appended to the query URL for authenticated calls.
        self.API_KEY = api_key

    def _run_query(self, payload, auth_required=False):
        # Send *payload* (a {"query": ...} dict) to the GraphQL endpoint,
        # retrying until a 200/404 response via request_with_retries.
        # NOTE(review): the key is passed as a URL query parameter, where it
        # may leak into logs/proxies — an Authorization header would be safer.
        url = 'https://api.runpod.io/graphql'
        if auth_required:
            url += f'?api_key={self.API_KEY}'
        response = request_with_retries(url, payload)
        return response

    # https://docs.runpod.io/docs/create-pod
    def create_on_demand_pod(self, pod_config):
        # Deploy an on-demand pod. *pod_config* is a pre-rendered string of
        # GraphQL input fields spliced into the mutation via .format(); the
        # doubled braces {{ }} are literal braces that survive formatting.
        return self._run_query({
            "query": """
            mutation {{
              podFindAndDeployOnDemand(input: {{ {pod_config} }}) {{
                containerDiskInGb
                apiKey
                costPerHr
                desiredStatus
                dockerArgs
                dockerId
                gpuCount
                id
                imageName
                machineId
                memoryInGb
                name
                podType
                ports
                templateId
                uptimeSeconds
                vcpuCount
                version
                volumeEncrypted
                volumeInGb
                volumeKey
                volumeMountPath
                runtime {{
                  uptimeInSeconds
                  ports {{
                    ip
                    isIpPublic
                    privatePort
                    publicPort
                    type
                  }}
                  gpus {{
                    id
                    gpuUtilPercent
                    memoryUtilPercent
                  }}
                  container {{
                    cpuPercent
                    memoryPercent
                  }}
                }}
                machine {{
                  podHostId
                }}
              }}
            }}
            """.format(pod_config=pod_config)
        }, True)
# Exact error strings returned by the RunPod API. create_pod() compares
# error['message'] against these verbatim to decide whether a failure is a
# retryable availability problem — do not reword them.
ERRORS = {
    "specs": (
        "There are no longer any instances available with "
        "the requested specifications. Please refresh and try again."
    ),
    "disk": (
        "There are no longer any instances available with "
        "enough disk space."
    )
}
def calculate_gpu(min_gpu_count, max_gpu_count):
    """Endlessly yield GPU counts from *max_gpu_count* down to *min_gpu_count*.

    The descending order means the most desirable (largest) pod is always
    attempted first on each pass.
    """
    descending_counts = range(max_gpu_count, min_gpu_count - 1, -1)
    while True:
        yield from descending_counts
def create_pod(api, bar, gpu_count_generator):
    """Keep trying to deploy an on-demand pod until one is actually rented.

    Each attempt pulls the next GPU count from *gpu_count_generator* (which
    cycles max→min forever) and submits a podFindAndDeployOnDemand mutation.

    Fixes over the original:
      * Retried by unbounded recursion (hence the module's huge recursion
        limit) AND discarded the recursive call's return value, so a retry
        that eventually succeeded still returned None. Now an iterative loop
        that returns the successful response.
      * The "sleeping for 5 seconds" message was printed but no sleep ever
        happened; the promised time.sleep(5) is now performed.

    Returns the parsed JSON response on success, or None on a fatal
    (non-retryable) error.
    """
    while True:
        gpu_count = next(gpu_count_generator)
        bar.set_description(f"{gpu_count} GPU")
        bar.update(1)
        pod_config = f"""
            cudaVersion: "{CUDA_VERSION}",
            templateId: "{TEMPLATE_ID}",
            networkVolumeId: "{NETWORK_VOLUME_ID}",
            dataCenterId: "{DATA_CENTER_ID}",
            minDownload: {MIN_DOWNLOAD_SPEED},
            gpuCount: {gpu_count},
            volumeInGb: {PERSISTENT_DISK_SIZE_GB},
            containerDiskInGb: {OS_DISK_SIZE_GB},
            gpuTypeId: "{GPU_TYPE_ID}",
            cloudType: {CLOUD_TYPE},
            supportPublicIp: true,
            name: "{NAME}",
            dockerArgs: "",
            volumeMountPath: "/workspace",
            imageName: "{IMAGE_NAME}",
            startJupyter: true,
            startSsh: true,
        """
        response = api.create_on_demand_pod(pod_config)
        resp_json = response.json()
        if response.status_code != 200:
            return None
        if 'errors' not in resp_json:
            return resp_json
        should_retry = False
        for error in resp_json['errors']:
            if error['message'] == ERRORS['specs']:
                # Availability race: just try again with the next GPU count.
                should_retry = True
            elif error['message'] == ERRORS['disk']:
                print(error)
                print('No instances with enough disk space available, sleeping for 5 seconds')
                time.sleep(5)
                should_retry = True
            else:
                print('ERROR: ' + error['message'])
        if not should_retry:
            return None
def get_pod_and_sound_alarm(api, min_gpu_count=MIN_GPU_COUNT, max_gpu_count=MAX_GPU_COUNT):
    """Rent a pod (retrying until one is secured) and play an audible alarm.

    Args:
        api: API client used to submit the deploy mutation.
        min_gpu_count: smallest acceptable GPU count (inclusive).
        max_gpu_count: largest desired GPU count (inclusive).
    """
    # Fix: the progress-bar label was hard-coded to "8 GPU"; it now reflects
    # the actual starting GPU count. create_pod updates it on every attempt.
    bar = tqdm(desc=f"{max_gpu_count} GPU", total=1)
    gpu_count_generator = calculate_gpu(min_gpu_count, max_gpu_count)
    create_pod(api, bar, gpu_count_generator)
    # NOTE(review): 'afplay' is macOS-only; no alarm sounds on Linux/Windows.
    os.system('afplay alarm.mov')
if __name__ == '__main__':
    args = parser.parse_args()
    # Fix: the API key was a hard-coded placeholder ("XXX"). Prefer the
    # RUNPOD_API_KEY environment variable, falling back to the old literal
    # so existing behavior is preserved when the variable is unset.
    api = API(os.environ.get('RUNPOD_API_KEY', 'XXX'))
    get_pod_and_sound_alarm(api, args.min_gpu_count, args.max_gpu_count)
# Advertisement
# Add Comment
# Please, Sign In to add comment
# Advertisement