{
“nbformat”: 4,
“nbformat_minor”: 0,
“metadata”: {
“colab”: {
“private_outputs”: true,
“provenance”: [],
“collapsed_sections”: [
“1tthw0YaispD”
],
“machine_shape”: “hm”,
“include_colab_link”: true
},
“kernelspec”: {
“name”: “python3”,
“display_name”: “Python 3”
},
“language_info”: {
“name”: “python”
},
“accelerator”: “GPU”
},
“cells”: [
{
“cell_type”: “markdown”,
“metadata”: {
“id”: “view-in-github”,
“colab_type”: “text”
},
“source”: [
"<a href="https://colab.research.google.com/github/isaacandy/DVCR-i-AI-Artist/blob/main/create_realistic_ai_generated_images_with_dvc_ri.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>"
]
},
{
“cell_type”: “markdown”,
“metadata”: {
“id”: “clJsMT0Eqizk”
},
“source”: [
"# Create Text-to-Image AI-Generated Images with VQGAN + CLIP\n",
“\n”,
“by Isaac Andy. \n”,
“\n”,
“This notebook allows you to create realistic AI generated images with as few clicks as possible for free! No coding or machine learning knowledge required!\n”,
“\n”,
“This notebook is forked with significant usability and technical optimizations from the original Colab notebook by @ak92501 which includes an implementation of VQGAN + CLIP w/ Pooling. The Notebook was originally made by Katherine Crowson (https://github.com/crowsonkb, https://twitter.com/RiversHaveWings). The original BigGAN+CLIP method was by https://twitter.com/advadnoun. Added some explanations and modifications by Eleiber#8347, pooling trick by Crimeacs#8222 (https://twitter.com/EarthML1). For more elaborate customization, see the original notebook or Zoetrope 5 by @classpectanon.\n”,
“\n”,
“To get started:\n”,
“\n”,
“1. Copy this notebook to your Google Drive to keep it and save your changes. (File -> Save a Copy in Drive)\n”,
“2. Run the cells below by clicking the Play button on the left of the cell (also visible when mousing-over the cell)\n”,
“\n”,
"_Last Updated: Aug 22nd, 2021_\n"
]
},
{
“cell_type”: “markdown”,
“metadata”: {
“id”: “CppIQlPhhwhs”
},
“source”: [
“## Setup”
]
},
{
“cell_type”: “code”,
“metadata”: {
“id”: “TkUfzT60ZZ9q”,
“cellView”: “form”
},
“source”: [
“#@title Check GPU\n”,
"#@markdown Run this cell to see which GPU the Colab notebook is running on. Ideally, it's not a K80, which is the slowest one.\n",
“\n”,
“!nvidia-smi”
],
“execution_count”: null,
“outputs”: []
},
{
“cell_type”: “code”,
“metadata”: {
“id”: “VA1PHoJrRiK9”,
“cellView”: “form”
},
“source”: [
“#@title Download Models and Install/Load Packages (may take a few minutes)\n”,
“\n”,
“!git clone https://github.com/openai/CLIP\n”,
“!git clone https://github.com/CompVis/taming-transformers.git\n”,
“!git clone https://github.com/minimaxir/icon-image.git\n”,
“!pip install Pillow numpy fire icon_font_to_png\n”,
“!pip install ftfy regex tqdm omegaconf pytorch-lightning\n”,
“!pip install kornia\n”,
“!pip install imageio-ffmpeg \n”,
“!pip install einops\n”,
"!pip install imageio\n",
“!mkdir steps\n”,
“\n”,
“print("Downloading ImageNet 16384")\n”,
“\n”,
"!curl -L -o vqgan_imagenet_f16_16384.ckpt -C - 'https://heibox.uni-heidelberg.de/f/867b05fc8c4841768640/?dl=1'\n",
"!curl -L -o vqgan_imagenet_f16_16384.yaml -C - 'https://heibox.uni-heidelberg.de/f/274fb24ed38341bfa753/?dl=1'\n",
“\n”,
“import argparse\n”,
“import math\n”,
“from pathlib import Path\n”,
“import sys\n”,
“\n”,
"sys.path.insert(1, '/content/taming-transformers')\n",
"sys.path.insert(1, '/content/icon-image')\n",
“\n”,
“from icon_image import gen_icon\n”,
“from IPython import display\n”,
“from base64 import b64encode\n”,
“from omegaconf import OmegaConf\n”,
“from PIL import Image\n”,
“from PIL.PngImagePlugin import PngInfo\n”,
“from taming.models import cond_transformer, vqgan\n”,
“import taming.modules \n”,
“import torch\n”,
“from torch import nn, optim\n”,
“from torch.nn import functional as F\n”,
“from torchvision import transforms\n”,
“from torchvision.transforms import functional as TF\n”,
“from torch.optim.lr_scheduler import StepLR\n”,
“from tqdm.notebook import tqdm\n”,
“from shutil import move\n”,
“import os\n”,
“\n”,
“from CLIP import clip\n”,
“import kornia.augmentation as K\n”,
“import numpy as np\n”,
“import imageio\n”,
“from PIL import ImageFile, Image\n”,
“ImageFile.LOAD_TRUNCATED_IMAGES = True\n”,
“\n”,
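"# Windowed-sinc (Lanczos) helpers: sinc() is the normalized sinc function and lanczos()\n",
"# builds a normalized Lanczos kernel, used by resample() below for anti-aliased downscaling.\n",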
“def sinc(x):\n”,
“ return torch.where(x != 0, torch.sin(math.pi * x) / (math.pi * x), x.new_ones([]))\n”,
“\n”,
“\n”,
“def lanczos(x, a):\n”,
“ cond = torch.logical_and(-a < x, x < a)\n”,
“ out = torch.where(cond, sinc(x) * sinc(x/a), x.new_zeros([]))\n”,
“ return out / out.sum()\n”,
“\n”,
“\n”,
“def ramp(ratio, width):\n”,
“ n = math.ceil(width / ratio + 1)\n”,
“ out = torch.empty([n])\n”,
“ cur = 0\n”,
“ for i in range(out.shape[0]):\n”,
“ out[i] = cur\n”,
“ cur += ratio\n”,
“ return torch.cat([-out[1:].flip([0]), out])[1:-1]\n”,
“\n”,
“\n”,
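"# Resample an NCHW tensor to `size`: when downscaling, first blur with a separable\n",
"# Lanczos kernel (height, then width) to reduce aliasing, then interpolate bicubically.\n",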
“def resample(input, size, align_corners=True):\n”,
“ n, c, h, w = input.shape\n”,
“ dh, dw = size\n”,
“\n”,
“ input = input.view([n * c, 1, h, w])\n”,
“\n”,
“ if dh < h:\n”,
“ kernel_h = lanczos(ramp(dh / h, 2), 2).to(input.device, input.dtype)\n”,
“ pad_h = (kernel_h.shape[0] - 1) // 2\n”,
" input = F.pad(input, (0, 0, pad_h, pad_h), 'reflect')\n",
“ input = F.conv2d(input, kernel_h[None, None, :, None])\n”,
“\n”,
“ if dw < w:\n”,
“ kernel_w = lanczos(ramp(dw / w, 2), 2).to(input.device, input.dtype)\n”,
“ pad_w = (kernel_w.shape[0] - 1) // 2\n”,
" input = F.pad(input, (pad_w, pad_w, 0, 0), 'reflect')\n",
“ input = F.conv2d(input, kernel_w[None, None, None, :])\n”,
“\n”,
“ input = input.view([n, c, h, w])\n”,
" return F.interpolate(input, size, mode='bicubic', align_corners=align_corners)\n",
“\n”,
“\n”,
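"# Straight-through gradient trick: forward() returns x_forward, but gradients are\n",
"# routed to x_backward (summed to its shape) during the backward pass.\n",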
“class ReplaceGrad(torch.autograd.Function):\n”,
“ @staticmethod\n”,
“ def forward(ctx, x_forward, x_backward):\n”,
“ ctx.shape = x_backward.shape\n”,
“ return x_forward\n”,
“\n”,
“ @staticmethod\n”,
“ def backward(ctx, grad_in):\n”,
“ return None, grad_in.sum_to_size(ctx.shape)\n”,
“\n”,
“\n”,
“replace_grad = ReplaceGrad.apply\n”,
“\n”,
“\n”,
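"# Clamp values to [min, max] in the forward pass; in the backward pass, keep only the\n",
"# gradient components that would move out-of-range values back toward the allowed range.\n",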
“class ClampWithGrad(torch.autograd.Function):\n”,
“ @staticmethod\n”,
“ def forward(ctx, input, min, max):\n”,
“ ctx.min = min\n”,
“ ctx.max = max\n”,
“ ctx.save_for_backward(input)\n”,
“ return input.clamp(min, max)\n”,
“\n”,
“ @staticmethod\n”,
“ def backward(ctx, grad_in):\n”,
“ input, = ctx.saved_tensors\n”,
“ return grad_in * (grad_in * (input - input.clamp(ctx.min, ctx.max)) >= 0), None, None\n”,
“\n”,
“\n”,
“clamp_with_grad = ClampWithGrad.apply\n”,
“\n”,
“\n”,
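"# Snap each latent vector to its nearest codebook entry (squared Euclidean distance),\n",
"# while letting gradients flow straight through to x via replace_grad.\n",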
“def vector_quantize(x, codebook):\n”,
“ d = x.pow(2).sum(dim=-1, keepdim=True) + codebook.pow(2).sum(dim=1) - 2 * x @ codebook.T\n”,
“ indices = d.argmin(-1)\n”,
“ x_q = F.one_hot(indices, codebook.shape[0]).to(d.dtype) @ codebook\n”,
“ return replace_grad(x_q, x)\n”,
“\n”,
“\n”,
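"# A text or image prompt encoded with CLIP: forward() measures how far the input\n",
"# embeddings are from the prompt embedding on the unit sphere, scaled by `weight`\n",
"# (negative weights push the image away from the prompt).\n",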
“class Prompt(nn.Module):\n”,
" def __init__(self, embed, weight=1., stop=float('-inf')):\n",
" super().__init__()\n",
" self.register_buffer('embed', embed)\n",
" self.register_buffer('weight', torch.as_tensor(weight))\n",
" self.register_buffer('stop', torch.as_tensor(stop))\n",
“\n”,
“ def forward(self, input):\n”,
“ input_normed = F.normalize(input.unsqueeze(1), dim=2)\n”,
“ embed_normed = F.normalize(self.embed.unsqueeze(0), dim=2)\n”,
“ dists = input_normed.sub(embed_normed).norm(dim=2).div(2).arcsin().pow(2).mul(2)\n”,
“ dists = dists * self.weight.sign()\n”,
“ return self.weight.abs() * replace_grad(dists, torch.maximum(dists, self.stop)).mean()\n”,
“\n”,
“\n”,
“def parse_prompt(prompt):\n”,
" vals = prompt.rsplit(':', 2)\n",
" vals = vals + ['', '1', '-inf'][len(vals):]\n",
“ return vals[0], float(vals[1]), float(vals[2])\n”,
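"# e.g. parse_prompt('apple') -> ('apple', 1.0, -inf)\n",
"# parse_prompt('apple:3') -> ('apple', 3.0, -inf)\n",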
“\n”,
“\n”,
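"# Produce `cutn` cutouts of the image for CLIP: each is the average of adaptive average\n",
"# and max pooling down to cut_size, followed by random affine/perspective/color-jitter/\n",
"# erasing augmentations plus a small amount of noise.\n",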
“class MakeCutouts(nn.Module):\n”,
" def __init__(self, cut_size, cutn, cut_pow=1.):\n",
" super().__init__()\n",
“ self.cut_size = cut_size\n”,
“ self.cutn = cutn\n”,
“ self.cut_pow = cut_pow\n”,
“\n”,
“ self.augs = nn.Sequential(\n”,
“ # K.RandomHorizontalFlip(p=0.5),\n”,
“ # K.RandomVerticalFlip(p=0.5),\n”,
“ # K.RandomSolarize(0.01, 0.01, p=0.7),\n”,
“ # K.RandomSharpness(0.3,p=0.4),\n”,
" # K.RandomResizedCrop(size=(self.cut_size,self.cut_size), scale=(0.1,1), ratio=(0.75,1.333), cropping_mode='resample', p=0.5),\n",
" # K.RandomCrop(size=(self.cut_size,self.cut_size), p=0.5),\n",
" K.RandomAffine(degrees=15, translate=0.1, p=0.7, padding_mode='border'),\n",
“ K.RandomPerspective(0.7,p=0.7),\n”,
“ K.ColorJitter(hue=0.1, saturation=0.1, p=0.7),\n”,
“ K.RandomErasing((.1, .4), (.3, 1/.3), same_on_batch=True, p=0.7),\n”,
“ \n”,
“)\n”,
“ self.noise_fac = 0.1\n”,
“ self.av_pool = nn.AdaptiveAvgPool2d((self.cut_size, self.cut_size))\n”,
“ self.max_pool = nn.AdaptiveMaxPool2d((self.cut_size, self.cut_size))\n”,
“\n”,
“ def forward(self, input):\n”,
“ sideY, sideX = input.shape[2:4]\n”,
“ max_size = min(sideX, sideY)\n”,
“ min_size = min(sideX, sideY, self.cut_size)\n”,
“ cutouts = []\n”,
“ \n”,
“ for _ in range(self.cutn):\n”,
“\n”,
" # size = int(torch.rand([])**self.cut_pow * (max_size - min_size) + min_size)\n",
“ # offsetx = torch.randint(0, sideX - size + 1, ())\n”,
“ # offsety = torch.randint(0, sideY - size + 1, ())\n”,
“ # cutout = input[:, :, offsety:offsety + size, offsetx:offsetx + size]\n”,
“ # cutouts.append(resample(cutout, (self.cut_size, self.cut_size)))\n”,
“\n”,
“ # cutout = transforms.Resize(size=(self.cut_size, self.cut_size))(input)\n”,
“ \n”,
“ cutout = (self.av_pool(input) + self.max_pool(input))/2\n”,
“ cutouts.append(cutout)\n”,
“ batch = self.augs(torch.cat(cutouts, dim=0))\n”,
“ if self.noise_fac:\n”,
“ facs = batch.new_empty([self.cutn, 1, 1, 1]).uniform_(0, self.noise_fac)\n”,
“ batch = batch + facs * torch.randn_like(batch)\n”,
“ return batch\n”,
“\n”,
“\n”,
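"# Load a taming-transformers VQGAN (or Gumbel/Net2Net variant) from its OmegaConf\n",
"# config and checkpoint, freeze its weights, and drop the training loss module.\n",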
“def load_vqgan_model(config_path, checkpoint_path):\n”,
“ config = OmegaConf.load(config_path)\n”,
" if config.model.target == 'taming.models.vqgan.VQModel':\n",
" model = vqgan.VQModel(**config.model.params)\n",
" model.eval().requires_grad_(False)\n",
" model.init_from_ckpt(checkpoint_path)\n",
" elif config.model.target == 'taming.models.vqgan.GumbelVQ':\n",
" model = vqgan.GumbelVQ(**config.model.params)\n",
" model.eval().requires_grad_(False)\n",
" model.init_from_ckpt(checkpoint_path)\n",
" elif config.model.target == 'taming.models.cond_transformer.Net2NetTransformer':\n",
" parent_model = cond_transformer.Net2NetTransformer(**config.model.params)\n",
" parent_model.eval().requires_grad_(False)\n",
" parent_model.init_from_ckpt(checkpoint_path)\n",
" model = parent_model.first_stage_model\n",
" else:\n",
" raise ValueError(f'unknown model type: {config.model.target}')\n",
“ del model.loss\n”,
“ return model\n”,
“\n”,
“\n”,
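"# Resize an image to roughly out_size's pixel area while preserving its aspect ratio,\n",
"# using Lanczos resampling.\n",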
“def resize_image(image, out_size):\n”,
“ ratio = image.size[0] / image.size[1]\n”,
“ area = min(image.size[0] * image.size[1], out_size[0] * out_size[1])\n”,
" size = round((area * ratio)**0.5), round((area / ratio)**0.5)\n",
“ return image.resize(size, Image.LANCZOS)\n”,
“\n”
],
“execution_count”: null,
“outputs”: []
},
{
“cell_type”: “markdown”,
“metadata”: {
“id”: “1tthw0YaispD”
},
“source”: [
“## Icon Background (Optional)\n”,
“\n”,
"A surprisingly effective trick to improve the generation quality of images, if you have a specific outcome in mind, is to generate an icon to serve as an initial image to start generation and/or as an image to target during generation. You can select any of the free Font Awesome icons. Just click on an icon you want to get its `icon_name`, such as `fas fa-robot`, then use it in the next cell to generate an icon image that helps steer the AI image generation.\n",
“\n”,
"See the icon-image GitHub repository (https://github.com/minimaxir/icon-image) for more information on configuration.\n",
“\n”
]
},
{
“cell_type”: “code”,
“metadata”: {
“id”: “qxrUUDzpshPn”,
“cellView”: “form”
},
“source”: [
“icon_name = "fas fa-tv" #@param {type:"string"}\n”,
“bg_width = 600 #@param {type:"integer"}\n”,
“bg_height = 600 #@param {type:"integer"}\n”,
“icon_size = 500 #@param {type:"integer"}\n”,
“icon_color = "black" #@param {type:"string"}\n”,
“bg_color = "white" #@param {type:"string"}\n”,
“icon_opacity = 0.8 #@param {type:"slider", min:0, max:1, step:0.1}\n”,
“bg_noise_opacity = 0.5 #@param {type:"slider", min:0, max:1, step:0.1}\n”,
“align = "center" #@param ["center", "left", "right", "top", "bottom"]\n”,
“\n”,
“icon_config = {\n”,
“ "icon_name": icon_name,\n”,
“ "bg_width": bg_width,\n”,
“ "bg_height": bg_height,\n”,
“ "icon_size": icon_size,\n”,
“ "icon_color": icon_color,\n”,
“ "bg_color": bg_color,\n”,
“ "icon_opacity": icon_opacity,\n”,
“ "bg_noise_opacity": bg_noise_opacity,\n”,
“ "align": align,\n”,
“ "seed": 42\n”,
“}\n”,
“\n”,
“try:\n”,
" for filename in ['fa-brands-400.ttf', 'fa-regular-400.ttf', 'fa-solid-900.ttf', 'fontawesome.min.css']:\n",
" move(os.path.join("/content", 'icon-image', filename), os.path.join("/content", filename))\n",
“except FileNotFoundError:\n”,
“ pass\n”,
“\n”,
"gen_icon(**icon_config)\n",
"display.display(display.Image('icon.png'))"
],
“execution_count”: null,
“outputs”: []
},
{
“cell_type”: “markdown”,
“metadata”: {
“id”: “p0qN8T1EzPn7”
},
“source”: [
“## AI Image Generation Settings\n”,
“\n”,
“The following cell allows you to set the training parameters for image generation:\n”,
“\n”,
“### Generation Settings\n”,
“\n”,
“- texts: The text prompt(s) you want the AI to generate an image from.\n”,
“ - You can include multiple prompts by separating them with a |, and the AI will attempt to optimize for all prompts simultaneously, e.g. apple | painting of a calm sunset\n”,
“ - You can apply a weight to each prompt by appending a :{weight} to each prompt, and the AI will attempt to favor prompts with a higher weight proportionally more, e.g. apple:3 | painting of a calm sunset\n”,
" - You can apply a negative weight to get the *opposite* of what the text is, which can result in chaos (in the case of a portrait of Elon Musk:3 | 3d rendering in unreal engine:-1, what is the opposite of a 3d rendering? Only one way to find out!).\n",
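" - For example, a watercolor of a lighthouse:2 | photorealistic:1 | text:-0.5 weights the watercolor prompt twice as heavily as the photorealistic prompt and steers the image away from rendering literal text.\n",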
“\n”,
“- width, height: Width and height of the image in pixels. Smaller images generate faster but are less detailed.\n”,
" - Going much above the default 600x600px size may cause the GPU to run out of memory.\n",
“ - For 4:3 images, I recommend 640x480; for 16:9 images, I recommend 640x360.\n”,
“\n”,
“- init_image: The initial image filename for starting the generation and finetuning. You can upload an image by opening the Colab Notebook sidebar, clicking the Folder icon, and uploading an image to the top level.\n”,
“ - If not specified, generation will start with a solid color.\n”,
“ - The image will be resized to the specified width/height.\n”,
“ - init_image_icon will use the icon specified in the previous cell as the init_image.\n”,
“\n”,
“- target_images: The target image filename(s) for the generation to target. \n”,
“ - You can use multiple images as noted in the texts section. It’s strongly recommended to tweak weights of both text prompts and image prompts if doing so.\n”,
“ - target_image_icon will use the icon specified in the previous cell as the target_image.\n”,
“\n”,
“### Training Settings\n”,
“\n”,
“- learning_rate: Learning rate for the model which controls the speed in which the model optimizes for the prompts. If too high, model can diverge; if too low, model may not train.\n”,
" - ~0.2 is recommended if training without an init_image; ~0.1 is recommended if using one.\n",
“\n”,
“- max_steps: Number of steps for training the model; the more steps, the better the generation.\n”,
“\n”,
“- images_interval: Number of steps for the training to check in and output an image of what is trained so far.\n”
]
},
{
“cell_type”: “code”,
“metadata”: {
“id”: “Pf8a78a2WKoU”,
“cellView”: “form”
},
“source”: [
“# Fixed parameters\n”,
“icon_path = "icon.png"\n”,
“model_name = "vqgan_imagenet_f16_16384"\n”,
“seed = 42\n”,
“\n”,
“texts = "iZND powered by GPT-3 technology logo" #@param {type:"string"}\n”,
“width = 600 #@param {type:"integer"}\n”,
“height = 600 #@param {type:"integer"}\n”,
“init_image = "" #@param {type:"string"}\n”,
“init_image_icon = False #@param {type:"boolean"}\n”,
“if init_image_icon:\n”,
“ assert os.path.exists(icon_path), "No icon has been generated from the previous cell"\n”,
“ init_image = icon_path\n”,
“\n”,
“target_images = "" #@param {type:"string"}\n”,
“target_image_icon = False #@param {type:"boolean"}\n”,
“if target_image_icon:\n”,
“ assert os.path.exists(icon_path), "No icon has been generated from the previous cell"\n”,
“ target_images = icon_path\n”,
“\n”,
"#@markdown ---\n",
“learning_rate = 0.24 #@param {type:"slider", min:0.00, max:0.30, step:0.01}\n”,
"max_steps = 400 #@param {type:"integer"}\n",
"images_interval = 100 #@param {type:"integer"}\n",
“\n”,
“gen_config = {\n”,
“ "texts": texts,\n”,
“ "width": width,\n”,
“ "height": height,\n”,
“ "init_image": "