Advertisement
kopyl

Untitled

Jan 12th, 2024
1,019
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.37 KB | None | 0 0
  1. import datasets
  2. import json
  3. from PIL import Image
  4.  
  5.  
  6. _FEATURES = datasets.Features(
  7.     {
  8.         "image": datasets.Image(),
  9.         "text": datasets.Value("string"),
  10.         "similarity": datasets.Value("float32"),
  11.         "hash": datasets.Value("int64"),
  12.         "punsafe": datasets.Value("float32"),
  13.         "pwatermark": datasets.Value("float32"),
  14.         "AESTHETIC_SCORE": datasets.Value("float32"),
  15.         "url": datasets.Value("string"),
  16.         "key": datasets.Value("string"),
  17.         "width": datasets.Value("int64"),
  18.         "height": datasets.Value("int64"),
  19.         "ratio": datasets.Value("float32"),
  20.         "exif": datasets.Value("string"),
  21.         "sha256": datasets.Value("string"),
  22.         "status": datasets.Value("string"),
  23.         "error_message": datasets.Value("string"),
  24.         "original_width": datasets.Value("int64"),
  25.         "original_height": datasets.Value("int64"),
  26.     },
  27. )
  28. _VERSION = datasets.Version("0.0.2")
  29. _DEFAULT_CONFIG = datasets.BuilderConfig(name="default", version=_VERSION)
  30. _DESCRIPTION = (
  31.     "Sample dataset of 100 images of different ratio from "
  32.     "[LAION Improved Aesthetics 6plus](https://huggingface.co/datasets/ChristophSchuhmann/improved_aesthetics_6plus)\n\n"
  33.     "I need this to see whether i can train a model on images with different ratios"
  34. )
  35.  
  36.  
  37. class SampleLAIONDatasetDifferentRatio(datasets.GeneratorBasedBuilder):
  38.     BUILDER_CONFIGS = [_DEFAULT_CONFIG]
  39.     DEFAULT_CONFIG_NAME = "default"
  40.  
  41.     def _info(self):
  42.         return datasets.DatasetInfo(
  43.             description=_DESCRIPTION,
  44.             features=_FEATURES,
  45.         )
  46.  
  47.     def _split_generators(self, dl_manager):
  48.         metadata_path = "/100-files.txt"
  49.         images_dir = '/preprocessed_2256k/train'
  50.  
  51.         return [
  52.             datasets.SplitGenerator(
  53.                 name=datasets.Split.TRAIN,
  54.                 gen_kwargs={
  55.                     "metadata_path": metadata_path,
  56.                     "images_dir": images_dir,
  57.                 },
  58.             ),
  59.         ]
  60.  
  61.     def _generate_examples(self, metadata_path, images_dir):
  62.         with open(metadata_path) as f:
  63.             metadata = json.load(f)
  64.  
  65.         for file_name in metadata:
  66.             image = Image.open(f"{images_dir}/{file_name}.jpg")
  67.             with open(f"{images_dir}/{file_name}.txt") as f:
  68.                 text = f.read()
  69.  
  70.             with open(f"{images_dir}/{file_name}.json") as f:
  71.                 data = json.load(f)
  72.                
  73.             to_return = {
  74.                 "image": image,
  75.                 "text": text,
  76.                 "ratio": data["width"] / data["height"],
  77.                
  78.                 "width": data["width"],
  79.                 "height": data["height"],
  80.                 "similarity": data["similarity"],
  81.                 "hash": data["hash"],
  82.                 "punsafe": data["punsafe"],
  83.                 "pwatermark": data["pwatermark"],
  84.                 "AESTHETIC_SCORE": data["AESTHETIC_SCORE"],
  85.                 "url": data["url"],
  86.                 "key": data["key"],
  87.                 "exif": data["exif"],
  88.                 "sha256": data["sha256"],
  89.                 "status": data["status"],
  90.                 "error_message": data["error_message"],
  91.                 "original_width": data["original_width"],
  92.                 "original_height": data["original_height"],
  93.             }
  94.  
  95.             yield file_name, to_return
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement