Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import datasets
- import json
- from PIL import Image
# Schema of every example produced by the builder. The key order below is the
# column order of the resulting dataset, so it is kept exactly as declared.
_FEATURES = datasets.Features(
    {
        # The picture itself and the caption text read from its ".txt" file.
        "image": datasets.Image(),
        "text": datasets.Value("string"),
        # Per-image values copied from the ".json" file next to each image.
        "similarity": datasets.Value("float32"),
        "hash": datasets.Value("int64"),
        "punsafe": datasets.Value("float32"),
        "pwatermark": datasets.Value("float32"),
        "AESTHETIC_SCORE": datasets.Value("float32"),
        "url": datasets.Value("string"),
        "key": datasets.Value("string"),
        "width": datasets.Value("int64"),
        "height": datasets.Value("int64"),
        # Computed by the builder as width / height.
        "ratio": datasets.Value("float32"),
        "exif": datasets.Value("string"),
        "sha256": datasets.Value("string"),
        "status": datasets.Value("string"),
        "error_message": datasets.Value("string"),
        "original_width": datasets.Value("int64"),
        "original_height": datasets.Value("int64"),
    },
)
# Human-readable summary attached to the DatasetInfo (Markdown link included).
_DESCRIPTION = (
    "Sample dataset of 100 images of different ratio from "
    "[LAION Improved Aesthetics 6plus](https://huggingface.co/datasets/ChristophSchuhmann/improved_aesthetics_6plus)\n\n"
    "I need this to see whether i can train a model on images with different ratios"
)

# Version of this loading script and the single configuration it exposes.
_VERSION = datasets.Version("0.0.2")
_DEFAULT_CONFIG = datasets.BuilderConfig(name="default", version=_VERSION)
class SampleLAIONDatasetDifferentRatio(datasets.GeneratorBasedBuilder):
    """Dataset builder for a small LAION sample with varying aspect ratios.

    Each example is assembled from three files sharing one base name inside
    the images directory: ``<name>.jpg`` (image), ``<name>.txt`` (caption)
    and ``<name>.json`` (per-image metadata).
    """

    BUILDER_CONFIGS = [_DEFAULT_CONFIG]
    DEFAULT_CONFIG_NAME = "default"

    # Keys copied unchanged from the per-image JSON file into the example.
    # Kept in the same order the example dict has always used.
    _SIDECAR_FIELDS = (
        "width",
        "height",
        "similarity",
        "hash",
        "punsafe",
        "pwatermark",
        "AESTHETIC_SCORE",
        "url",
        "key",
        "exif",
        "sha256",
        "status",
        "error_message",
        "original_width",
        "original_height",
    )

    def _info(self):
        """Return the dataset description and feature schema."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=_FEATURES,
        )

    def _split_generators(self, dl_manager):
        """Return the single TRAIN split.

        ``dl_manager`` is not used: nothing is downloaded, both inputs are
        read from fixed local paths.
        """
        # NOTE(review): these are hard-coded absolute paths, so the script
        # only works on a machine where they exist.
        metadata_path = "/100-files.txt"
        images_dir = '/preprocessed_2256k/train'
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "metadata_path": metadata_path,
                    "images_dir": images_dir,
                },
            ),
        ]

    def _generate_examples(self, metadata_path, images_dir):
        """Yield ``(key, example)`` pairs, one per entry in the metadata file.

        Args:
            metadata_path: Path to a JSON document that, when iterated,
                yields the base names (no extension) of the sample files.
            images_dir: Directory holding the ``.jpg``/``.txt``/``.json``
                triple for every base name.

        Yields:
            ``(file_name, example)`` where ``example`` holds the image, its
            caption, the width/height ratio and the JSON metadata fields.

        Raises:
            FileNotFoundError: If any of the expected files is missing.
            KeyError: If a per-image JSON file lacks an expected field.
        """
        # Explicit UTF-8 so parsing does not depend on the platform locale.
        with open(metadata_path, encoding="utf-8") as f:
            file_names = json.load(f)

        for file_name in file_names:
            base_path = f"{images_dir}/{file_name}"
            # Image.open keeps the underlying file open; the context manager
            # closes it once the consumer has taken the example and asks for
            # the next one, instead of leaking one handle per image.
            with Image.open(f"{base_path}.jpg") as image:
                with open(f"{base_path}.txt", encoding="utf-8") as f:
                    text = f.read()
                with open(f"{base_path}.json", encoding="utf-8") as f:
                    data = json.load(f)

                example = {
                    "image": image,
                    "text": text,
                    "ratio": data["width"] / data["height"],
                }
                example.update(
                    {field: data[field] for field in self._SIDECAR_FIELDS}
                )
                # Yield while the image is still open so the consumer can
                # encode it before the handle is released.
                yield file_name, example
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement