Source code for dgenerate.imageprocessors.adetailer

# Copyright (c) 2023, Teriks
#
# dgenerate is distributed under the following BSD 3-Clause License
#
# Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in
#    the documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import PIL.Image
import PIL.ImageOps
import diffusers
import torch

import dgenerate.extras.asdff.base as _asdff
import dgenerate.image as _image
import dgenerate.imageprocessors.imageprocessor as _imageprocessor
import dgenerate.messages
import dgenerate.pipelinewrapper as _pipelinewrapper
import dgenerate.pipelinewrapper.constants as _constants
import dgenerate.pipelinewrapper.enums as _enums
import dgenerate.pipelinewrapper.uris as _uris
import dgenerate.promptweighters as _promptweighters
import dgenerate.textprocessing as _textprocessing
import dgenerate.types as _types
import dgenerate.imageprocessors.util as _util



[docs]
class AdetailerProcessor(_imageprocessor.ImageProcessor):
    r"""
    adetailer, diffusion based post processor for SD1.5, SDXL, Kolors, SD3, and Flux

    adetailer can detect features of your image and automatically generate an inpaint
    mask for them, such as faces, hands etc. and then re-run diffusion over those portions
    of the image using inpainting to enhance detail.

    This image processor may only be used if a diffusion pipeline has been
    previously executed by dgenerate, that pipeline will be used to process
    the inpainting done by adetailer. For a single command line invocation
    you must use --post-processors to use this image processor correctly. In
    dgenerate config script, you may use it anywhere, and the last executed
    diffusion pipeline will be reused for inpainting.

    Inpainting will occur on the device used by the last executed diffusion
    pipeline unless the "device" argument is specified, the detector model can be run on
    an alternate GPU if desired using the "detector-device" argument, otherwise
    the detector will run on "device".

    Example:

    NOWRAP!
    --post-processors "adetailer;\
                       model=Bingsu/adetailer;\
                       weight-name=face_yolov8n.pt;\
                       prompt=detailed image of a mans face;\
                       negative-prompt=nsfw, blurry, disfigured;\
                       guidance-scale=7;\
                       inference-steps=30;\
                       strength=0.4"

    -----

    The "model" argument specifies which YOLO model to use. This can be a path to a local
    model file, a URL to download the model from, or a HuggingFace repository slug / blob link.

    The "prompt" argument specifies the positive prompt to use for inpainting.

    The "negative-prompt" argument specifies the negative prompt for inpainting.

    The "prompt-weighter" argument specifies a prompt weighter plugin for applying
    prompt weighting to the provided positive and negative prompts. Prompt weighters
    may have arguments, when supplying URI arguments to a prompt weighter you must
    use double quoting around the prompt weighter definition, i.e:
    --post-processors "adetailer;model=...;prompt=test;prompt-weighter='compel;syntax=sdwui'"

    The "weight-name" argument specifies the file name in a HuggingFace repository
    for the model weights, if you have provided a HuggingFace repository slug to the
    model argument.

    The "subfolder" argument specifies the subfolder in a HuggingFace repository
    for the model weights, if you have provided a HuggingFace repository slug to the
    model argument.

    The "revision" argument specifies the revision of a HuggingFace repository
    for the model weights, if you have provided a HuggingFace repository slug to the
    model argument. For example: "main"

    The "token" argument specifies your HuggingFace authentication token explicitly
    if needed.

    The "local-files-only" argument specifies that dgenerate should not attempt to
    download any model files, and to only look for them locally in the cache or
    otherwise.

    The "seed" argument can be used to specify a specific seed for diffusion
    when performing inpainting on the input image.

    The "inference-steps" argument specifies the amount of inference steps
    when performing inpainting on the input image.

    The "guidance-scale" argument specifies the guidance scale for inpainting.

    The "pag-scale" argument indicates the perturbed attention guidance scale,
    this enables a PAG inpaint pipeline if supported. If the previously used
    pipeline was a PAG pipeline, PAG is automatically enabled for inpainting
    if supported and this value defaults to 3.0 if not supplied. The adetailer
    processor supports PAG with --model-type sd and sdxl.

    The "pag-adaptive-scale" argument indicates the perturbed attention guidance
    adaptive scale, this enables a PAG inpaint pipeline if supported.
    If the previously used pipeline was a PAG pipeline, PAG is automatically
    enabled for inpainting if supported and this value defaults to 0.0 if
    not supplied. The adetailer processor supports PAG with
    --model-type sd and sdxl.

    The "strength" argument is analogous to --image-seed-strengths

    The "class-filter" argument can be used to detect only specific classes. This should be a
    comma-separated list of class IDs or class names, or a single value, for example: "0,2,person,car".
    This filter is applied before "index-filter".

    Example "class-filter" values:

        NOWRAP!
        # Only keep detection class ID 0
        class-filter=0

        NOWRAP!
        # Only keep detection class "hand"
        class-filter=hand

        NOWRAP!
        # keep class ID 2,3
        class-filter=2,3

        NOWRAP!
        # keep class ID 0 & class Name "hand"
        # if entry cannot be parsed as an integer
        # it is interpreted as a name
        class-filter=0,hand

        NOWRAP!
        # "0" is interpreted as a name and not an ID,
        # this is not likely to be useful
        class-filter="0",hand

        NOWRAP!
        # List syntax is supported, you must quote
        # class names
        class-filter=[0, "hand"]

    The "index-filter" argument is a list of values or a single value that indicates
    what YOLO detection indices to keep, the index values start at zero. Detections are
    sorted by their top left bounding box coordinate from left to right, top to bottom,
    by (confidence descending). The order of detections in the image is identical to
    the reading order of words on a page (english). Inpainting will only be
    performed on the specified detection indices, if no indices are specified, then
    inpainting will be performed on all detections.

    Example "index-filter" values:

        NOWRAP!
        # keep the first, leftmost, topmost detection
        index-filter=0

        NOWRAP!
        # keep detections 1 and 3
        index-filter=[1, 3]

        NOWRAP!
        # CSV syntax is supported (tuple)
        index-filter=1,3

    The "detector-padding" argument specifies the amount of padding
    that will be added to the detection rectangle which is used to
    generate a masked area. The default is 0, you can make the mask
    area around the detected feature larger with positive padding
    and smaller with negative padding.

    Padding examples:

        NOWRAP!
        32 (32px Uniform, all sides)

        NOWRAP!
        10x20 (10px Horizontal, 20px Vertical)

        NOWRAP!
        10x20x30x40 (10px Left, 20px Top, 30px Right, 40px Bottom)

    The "mask-padding" argument indicates how much padding to place around
    the masked area when cropping out the image to be inpainted. This value must be
    large enough to accommodate any feathering on the edge of the mask caused
    by "mask-blur" or "mask-dilation" for the best result, the default value is 32.
    The syntax for specifying this value is identical to "detector-padding".

    The "mask-shape" argument indicates what mask shape adetailer should
    attempt to draw around a detected feature, the default value is "rectangle".
    You may also specify "circle" to generate an ellipsoid shaped mask, which
    might be helpful for achieving better blending.

    The "mask-blur" argument indicates the level of gaussian blur to apply
    to the generated inpaint mask, which can help with smooth blending in
    of the inpainted feature

    The "mask-dilation" argument indicates the amount of dilation applied
    to the inpaint mask, see: cv2.dilate

    The "model-masks" argument indicates that masks generated by the
    model itself should be preferred over masks generated from the
    detection bounding box. If this is True, and the model itself
    returns mask data, "mask-shape", "mask-padding",
    and "detector-padding" will all be ignored.

    The "confidence" argument can be used to adjust the confidence
    value for the YOLO detector model. Defaults to: 0.3

    The "detector-device" argument can be used to specify a device
    override for the YOLO detector, i.e. the GPU / Accelerate device
    the model will run on. Example: cuda:0, cuda:1, cpu

    The "size" argument specifies the target size for processing detected areas.
    When specified, detected areas will always be scaled to this target size (with aspect ratio preserved)
    for processing, then scaled back to the original size for compositing.
    This can significantly improve detail quality for small detected features like faces or hands,
    or reduce processing time for overly large detected areas.
    
    The scaling is based on the larger dimension (width or height) of the detected area.
    If the detected area's larger dimension is smaller than the target size, it will be upscaled.
    If the detected area's larger dimension is larger than the target size, it will be downscaled.
    Scaling is always performed when this argument is specified.
    
    The value must be an integer greater than 1. The optimal resampling method is automatically
    selected based on whether upscaling or downscaling is needed.

    Example: size=1024 (always process detected areas at 1024px for the larger dimension)

    The "pre-resize" argument determines if the processing occurs before or after dgenerate resizes the image.
    This defaults to False, meaning the image is processed after dgenerate is done resizing it.
    """

    # Name(s) this processor is registered under.
    NAMES = ['adetailer']

    # Constructor arguments listed here are presumably hidden from user facing
    # help / UI output -- NOTE(review): confirm against the plugin base class.
    HIDE_ARGS = ['pipe', 'model-offload']

    # Selectable values offered for the "mask-shape" argument.
    OPTION_ARGS = {
        'mask-shape': ['r', 'rect', 'rectangle', 'c', 'circle', 'ellipse'],
    }

    # File selection metadata for the "model" argument. The checkpoint glob
    # is spelled '*.ckpt' (the original '*.cpkt' transposed two letters and
    # could never match a checkpoint file).
    FILE_ARGS = {
        'model': {'mode': 'in', 'filetypes': [('Models', ['*.safetensors', '*.pt', '*.pth', '*.ckpt', '*.bin'])]},
    }


[docs]
    def __init__(self,
                 model: str,
                 prompt: str,
                 negative_prompt: str | None = None,
                 prompt_weighter: str | None = None,
                 weight_name: str | None = None,
                 subfolder: str | None = None,
                 revision: str | None = None,
                 token: str | None = None,
                 seed: int | None = None,
                 inference_steps: int = 30,
                 guidance_scale: float = 5,
                 pag_scale: float | None = None,
                 pag_adaptive_scale: float | None = None,
                 strength: float = 0.4,
                 detector_padding: int | str = _constants.DEFAULT_ADETAILER_DETECTOR_PADDING,
                 mask_shape: str = _constants.DEFAULT_ADETAILER_MASK_SHAPE,
                 class_filter: int | str | list | tuple | set | None = None,
                 index_filter: int | list | tuple | set | None = None,
                 mask_padding: int | str = _constants.DEFAULT_ADETAILER_MASK_PADDING,
                 mask_blur: int = _constants.DEFAULT_ADETAILER_MASK_BLUR,
                 mask_dilation: int = _constants.DEFAULT_ADETAILER_MASK_DILATION,
                 model_masks: bool = False,
                 confidence: float = _constants.DEFAULT_ADETAILER_DETECTOR_CONFIDENCE,
                 detector_device: _types.OptionalName = None,
                 size: int | None = None,
                 pipe: diffusers.DiffusionPipeline = None,
                 pre_resize: bool = False,
                 **kwargs):
        """
        Validate the processor arguments and resolve the detector model path.

        The meaning of each user facing argument is described in the class
        docstring. Every validation failure is reported by raising the
        exception built by ``self.argument_error``.

        :param model: detector model path, URL, or HuggingFace repository slug
        :param prompt: positive prompt used for inpainting
        :param mask_shape: ``rectangle`` or ``circle``; the aliases offered by
            ``OPTION_ARGS`` (``r``, ``rect``, ``c``, ``ellipse``) are accepted
            and normalized to one of those two names
        :param mask_padding: integer, or a dimension string such as
            ``WIDTHxHEIGHT`` / ``LEFTxTOPxRIGHTxBOTTOM``
        :param detector_padding: same syntax as ``mask_padding``
        :param pipe: optional pipeline to inpaint with, instead of recalling
            the last used main pipeline
        :param pre_resize: process the image before (``True``) or after
            (``False``) dgenerate resizes it
        :param kwargs: forwarded to base class
        """
        super().__init__(**kwargs)

        # mask-padding is parsed first so its error wins if both are invalid
        mask_padding = self._parse_padding_argument('mask-padding', mask_padding)
        detector_padding = self._parse_padding_argument('detector-padding', detector_padding)

        mask_shape = mask_shape.lower()

        # Parse detection filters
        self._class_filter, self._index_filter = _util.yolo_filters_parse(
            class_filter,
            index_filter,
            self.argument_error
        )

        # every choice advertised in OPTION_ARGS maps onto a canonical shape
        # name, only the canonical name is stored and passed on
        mask_shape_aliases = {
            'r': 'rectangle',
            'rect': 'rectangle',
            'rectangle': 'rectangle',
            'c': 'circle',
            'circle': 'circle',
            'ellipse': 'circle',
        }

        if mask_shape not in mask_shape_aliases:
            raise self.argument_error('mask-shape must be either "rectangle" or "circle".')

        mask_shape = mask_shape_aliases[mask_shape]

        if mask_blur < 0:
            raise self.argument_error('mask-blur may not be less than zero.')

        if mask_dilation < 0:
            raise self.argument_error('mask-dilation may not be less than zero.')

        if inference_steps <= 0:
            raise self.argument_error('inference-steps must be greater than zero.')

        if guidance_scale < 0:
            raise self.argument_error('guidance-scale may not be less than zero.')

        if pag_scale is not None and pag_scale < 0:
            raise self.argument_error('pag-scale may not be less than zero.')

        if pag_adaptive_scale is not None and pag_adaptive_scale < 0:
            raise self.argument_error('pag-adaptive-scale may not be less than zero.')

        if strength < 0:
            raise self.argument_error('strength may not be less than zero.')

        if strength > 1:
            raise self.argument_error('strength may not be greater than 1.')

        if confidence < 0.0:
            raise self.argument_error('confidence may not be less than 0.')

        if size is not None and size <= 1:
            raise self.argument_error('size must be an integer greater than 1.')

        self._prompt = prompt
        self._negative_prompt = negative_prompt
        self._detector_padding = detector_padding
        self._mask_shape = mask_shape
        self._mask_padding = mask_padding
        self._mask_blur = mask_blur
        self._mask_dilation = mask_dilation
        self._model_masks = model_masks
        self._inference_steps = inference_steps
        self._guidance_scale = guidance_scale
        self._pag_scale = pag_scale
        self._pag_adaptive_scale = pag_adaptive_scale
        self._strength = strength
        self._seed = seed
        self._prompt_weighter = prompt_weighter
        self._detector_device = detector_device
        self._confidence = confidence
        self._size = size

        self._pre_resize = pre_resize
        self._pipe = pipe

        # the detector URI object is only used here to resolve the model
        # path, any failure is surfaced as an argument error
        try:
            self._model_path = _uris.AdetailerDetectorUri(
                model=model,
                revision=revision,
                subfolder=subfolder,
                weight_name=weight_name,
                class_filter=self._class_filter,
                index_filter=self._index_filter,
                mask_shape=self._mask_shape,
                detector_padding=self._detector_padding,
                mask_padding=self._mask_padding,
                mask_blur=self._mask_blur,
                mask_dilation=self._mask_dilation,
                model_masks=self._model_masks,
                confidence=self._confidence,
                prompt=self._prompt,
                negative_prompt=self._negative_prompt,
                device=self._detector_device
            ).get_model_path(
                use_auth_token=token,
                local_files_only=self.local_files_only)
        except Exception as e:
            raise self.argument_error(str(e)) from e

    def _parse_padding_argument(self, name: str, value):
        """
        Parse a padding argument given as an integer or a dimension string.

        :param name: user facing argument name, used in the error message
        :param value: an integer, or a value accepted by
            ``dgenerate.textprocessing.parse_dimensions``
        :return: ``value`` unchanged if it is an integer, the single parsed
            dimension if only one was given, otherwise the parsed
            dimensions (2 or 4 entries)
        """
        if isinstance(value, int):
            return value

        try:
            dimensions = _textprocessing.parse_dimensions(value)

            if len(dimensions) not in {1, 2, 4}:
                raise ValueError()

        except ValueError as e:
            raise self.argument_error(
                f'{name} must be an integer value, WIDTHxHEIGHT, or LEFTxTOPxRIGHTxBOTTOM') from e

        if len(dimensions) == 1:
            return dimensions[0]

        return dimensions


    def _adetailer(self, image):
        """
        Detect features in an image and inpaint them using a diffusion pipeline.

        The pipeline given to the constructor as ``pipe`` is used when set,
        otherwise the last used main pipeline is recalled from
        ``DiffusionPipelineWrapper``.

        :param image: the image to process
        :return: the first image returned by the adetailer pipeline (carrying
            the input image's filename), or ``image`` itself if the pipeline
            returned no images
        :raises: the exception built by ``self.argument_error`` when no
            pipeline can be found, when PAG arguments are used with an
            unsupported pipeline, or when the prompt weighter cannot be loaded
        """
        i_filename = _image.get_filename(image)

        if self._pipe:
            last_pipe = self._pipe
        else:
            last_pipe = _pipelinewrapper.DiffusionPipelineWrapper.recall_last_used_main_pipeline()
            if last_pipe is not None:
                # we only want the primary pipe, not the sdxl refiner for instance
                last_pipe = last_pipe.pipeline

        if last_pipe is None:
            raise self.argument_error(
                'adetailer could not find the last image generation pipeline that was used '
                'for image generation, please perform an image generation operation before attempting to '
                'use this processor. This processor is best used with the --post-processors option '
                'of dgenerate. It is possible however, to use this processor elsewhere in a config '
                'script if image generation has occurred previously. It will re-use the last '
                'image generation pipelines components for inpainting.')

        # the model family is inferred from the pipeline class name
        pipe_class_name = last_pipe.__class__.__name__

        is_flux = pipe_class_name.startswith('Flux') and \
                  not isinstance(last_pipe, diffusers.FluxFillPipeline)

        is_sdxl = pipe_class_name.startswith('StableDiffusionXL')
        is_sd3 = pipe_class_name.startswith('StableDiffusion3')
        is_sd = pipe_class_name.startswith('StableDiffusion') and not is_sd3 and not is_sdxl
        is_kolors = pipe_class_name.startswith('Kolors')

        ad_pipe = _asdff.AdPipelineBase(last_pipe)

        pipeline_args = {
            "num_inference_steps": self._inference_steps,
            "guidance_scale": self._guidance_scale,
            "prompt": self._prompt
        }

        if self._pag_scale is not None or \
                self._pag_adaptive_scale is not None:
            if not (is_sd or is_sdxl):
                raise self.argument_error(
                    'adetailer arguments "pag-scale" and "pag-adaptive-scale" may not '
                    'be used with anything other than --model-type sd and sdxl')

            ad_pipe.force_pag = True

            if self._pag_scale is not None:
                pipeline_args['pag_scale'] = self._pag_scale
            if self._pag_adaptive_scale is not None:
                pipeline_args['pag_adaptive_scale'] = self._pag_adaptive_scale

        if is_sdxl:
            # NOTE(review): PIL's size is (width, height), confirm the order
            # that the wrapped pipeline expects for target_size
            pipeline_args['target_size'] = image.size

        if not is_flux:
            pipeline_args['negative_prompt'] = self._negative_prompt
        elif self._negative_prompt:
            dgenerate.messages.log(
                'adetailer is ignoring negative prompt, as Flux does not support negative prompting.')

        prompt_weighter = None

        if self._prompt_weighter:
            loader = _promptweighters.PromptWeighterLoader()

            if is_flux:
                model_type = _enums.ModelType.FLUX
            elif is_sdxl:
                model_type = _enums.ModelType.SDXL
            elif is_kolors:
                model_type = _enums.ModelType.KOLORS
            elif is_sd3:
                model_type = _enums.ModelType.SD3
            elif is_sd:
                model_type = _enums.ModelType.SD
            else:
                raise self.argument_error(
                    f'Pipeline: "{pipe_class_name}" does not support adetailer use.')

            # Use the first text encoder that is actually present. diffusers
            # names the additional encoders "text_encoder_2" / "text_encoder_3",
            # the un-underscored spellings are kept as a last resort fallback.
            text_encoder = None
            for encoder_attr in ('text_encoder',
                                 'text_encoder_2',
                                 'text_encoder_3',
                                 'text_encoder2',
                                 'text_encoder3'):
                text_encoder = getattr(last_pipe, encoder_attr, None)
                if text_encoder is not None:
                    break

            if text_encoder is None:
                raise self.argument_error(
                    'adetailer processor could not determine text encoder dtype for prompt weighting.')

            encoder_dtype = next(text_encoder.parameters()).dtype

            # removeprefix drops exactly the "torch." prefix, str.lstrip would
            # strip any leading run of the characters "t", "o", "r", "c", "h", "."
            encoder_dtype = _enums.get_data_type_enum(str(encoder_dtype).removeprefix('torch.'))

            try:
                prompt_weighter = loader.load(
                    self._prompt_weighter,
                    model_type=model_type,
                    dtype=encoder_dtype,
                    local_files_only=self.local_files_only,
                    device=self.device
                )
            except Exception as e:
                raise self.argument_error(str(e)) from e

        if self._seed is not None:
            generator = torch.Generator(
                device=self.device).manual_seed(self._seed)
            pipeline_args['generator'] = generator

        pipeline_args['strength'] = self._strength

        result = ad_pipe(
            pipeline_args=pipeline_args,
            images=[image],
            mask_shape=self._mask_shape,
            mask_dilation=self._mask_dilation,
            mask_blur=self._mask_blur,
            mask_padding=self._mask_padding,
            model_masks=self._model_masks,
            detector_padding=self._detector_padding,
            model_path=self._model_path,
            device=self.device,
            # the detector runs on the inpainting device unless overridden
            detector_device=_types.default(self._detector_device, self.device),
            confidence=self._confidence,
            prompt_weighter=prompt_weighter,
            class_filter=self._class_filter,
            index_filter=self._index_filter,
            processing_size=self._size
        )

        if len(result.images) > 0:
            output_image = result.images[0]
            # carry the input filename over to the processed image
            output_image.filename = i_filename
        else:
            output_image = image

        return output_image


[docs]
    def impl_pre_resize(self, image: PIL.Image.Image, resize_resolution: _types.OptionalSize):
        """
        Pre-resize hook, runs adetailer only when "pre-resize" is enabled.

        :param image: the image to process
        :param resize_resolution: requested resize resolution, not used here
        :return: the adetailer result when "pre-resize" is enabled,
            otherwise ``image`` unchanged
        """
        return self._adetailer(image) if self._pre_resize else image



[docs]
    def impl_post_resize(self, image: PIL.Image.Image):
        """
        Post-resize hook, runs adetailer unless "pre-resize" is enabled.

        :param image: the image to process
        :return: ``image`` unchanged when "pre-resize" is enabled,
            otherwise the adetailer result
        """
        if self._pre_resize:
            # already handled by the pre-resize hook
            return image

        return self._adetailer(image)



[docs]
    def to(self, device) -> "AdetailerProcessor":
        """
        No-op device move, the requested device is ignored.

        :param device: the device, unused
        :return: this processor, unchanged
        """
        return self