Init
2
.gitattributes
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
# Auto detect text files and perform LF normalization
|
||||||
|
* text=auto
|
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
|
||||||
|
*.pyc
|
BIN
AnyText/.DS_Store
vendored
Normal file
421
AnyText/AnyText_scripts/AnyText_bert_tokenizer.py
Normal file
@ -0,0 +1,421 @@
|
|||||||
|
# Copyright 2018 The Google AI Language Team Authors.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Tokenization classes."""
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, print_function
|
||||||
|
import collections
|
||||||
|
import re
|
||||||
|
import unicodedata
|
||||||
|
|
||||||
|
import six
|
||||||
|
|
||||||
|
|
||||||
|
def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
|
||||||
|
"""Checks whether the casing config is consistent with the checkpoint name."""
|
||||||
|
|
||||||
|
# The casing has to be passed in by the user and there is no explicit check
|
||||||
|
# as to whether it matches the checkpoint. The casing information probably
|
||||||
|
# should have been stored in the bert_config.json file, but it's not, so
|
||||||
|
# we have to heuristically detect it to validate.
|
||||||
|
|
||||||
|
if not init_checkpoint:
|
||||||
|
return
|
||||||
|
|
||||||
|
m = re.match('^.*?([A-Za-z0-9_-]+)/bert_model.ckpt', init_checkpoint)
|
||||||
|
if m is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
model_name = m.group(1)
|
||||||
|
|
||||||
|
lower_models = [
|
||||||
|
'uncased_L-24_H-1024_A-16', 'uncased_L-12_H-768_A-12',
|
||||||
|
'multilingual_L-12_H-768_A-12', 'chinese_L-12_H-768_A-12'
|
||||||
|
]
|
||||||
|
|
||||||
|
cased_models = [
|
||||||
|
'cased_L-12_H-768_A-12', 'cased_L-24_H-1024_A-16',
|
||||||
|
'multi_cased_L-12_H-768_A-12'
|
||||||
|
]
|
||||||
|
|
||||||
|
is_bad_config = False
|
||||||
|
if model_name in lower_models and not do_lower_case:
|
||||||
|
is_bad_config = True
|
||||||
|
actual_flag = 'False'
|
||||||
|
case_name = 'lowercased'
|
||||||
|
opposite_flag = 'True'
|
||||||
|
|
||||||
|
if model_name in cased_models and do_lower_case:
|
||||||
|
is_bad_config = True
|
||||||
|
actual_flag = 'True'
|
||||||
|
case_name = 'cased'
|
||||||
|
opposite_flag = 'False'
|
||||||
|
|
||||||
|
if is_bad_config:
|
||||||
|
raise ValueError(
|
||||||
|
'You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. '
|
||||||
|
'However, `%s` seems to be a %s model, so you '
|
||||||
|
'should pass in `--do_lower_case=%s` so that the fine-tuning matches '
|
||||||
|
'how the model was pre-training. If this error is wrong, please '
|
||||||
|
'just comment out this check.' %
|
||||||
|
(actual_flag, init_checkpoint, model_name, case_name,
|
||||||
|
opposite_flag))
|
||||||
|
|
||||||
|
|
||||||
|
def convert_to_unicode(text):
|
||||||
|
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
|
||||||
|
if six.PY3:
|
||||||
|
if isinstance(text, str):
|
||||||
|
return text
|
||||||
|
elif isinstance(text, bytes):
|
||||||
|
return text.decode('utf-8', 'ignore')
|
||||||
|
else:
|
||||||
|
raise ValueError('Unsupported string type: %s' % (type(text)))
|
||||||
|
elif six.PY2:
|
||||||
|
if isinstance(text, str):
|
||||||
|
return text.decode('utf-8', 'ignore')
|
||||||
|
elif isinstance(text, unicode):
|
||||||
|
return text
|
||||||
|
else:
|
||||||
|
raise ValueError('Unsupported string type: %s' % (type(text)))
|
||||||
|
else:
|
||||||
|
raise ValueError('Not running on Python2 or Python 3?')
|
||||||
|
|
||||||
|
|
||||||
|
def printable_text(text):
|
||||||
|
"""Returns text encoded in a way suitable for print or `tf.logging`."""
|
||||||
|
|
||||||
|
# These functions want `str` for both Python2 and Python3, but in one case
|
||||||
|
# it's a Unicode string and in the other it's a byte string.
|
||||||
|
if six.PY3:
|
||||||
|
if isinstance(text, str):
|
||||||
|
return text
|
||||||
|
elif isinstance(text, bytes):
|
||||||
|
return text.decode('utf-8', 'ignore')
|
||||||
|
else:
|
||||||
|
raise ValueError('Unsupported string type: %s' % (type(text)))
|
||||||
|
elif six.PY2:
|
||||||
|
if isinstance(text, str):
|
||||||
|
return text
|
||||||
|
elif isinstance(text, unicode):
|
||||||
|
return text.encode('utf-8')
|
||||||
|
else:
|
||||||
|
raise ValueError('Unsupported string type: %s' % (type(text)))
|
||||||
|
else:
|
||||||
|
raise ValueError('Not running on Python2 or Python 3?')
|
||||||
|
|
||||||
|
|
||||||
|
def load_vocab(vocab_file):
|
||||||
|
"""Loads a vocabulary file into a dictionary."""
|
||||||
|
vocab = collections.OrderedDict()
|
||||||
|
index = 0
|
||||||
|
with open(vocab_file, 'r', encoding='utf-8') as reader:
|
||||||
|
while True:
|
||||||
|
token = convert_to_unicode(reader.readline())
|
||||||
|
if not token:
|
||||||
|
break
|
||||||
|
token = token.strip()
|
||||||
|
vocab[token] = index
|
||||||
|
index += 1
|
||||||
|
return vocab
|
||||||
|
|
||||||
|
|
||||||
|
def convert_by_vocab(vocab, items):
|
||||||
|
"""Converts a sequence of [tokens|ids] using the vocab."""
|
||||||
|
output = []
|
||||||
|
for item in items:
|
||||||
|
output.append(vocab[item])
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
def convert_tokens_to_ids(vocab, tokens):
|
||||||
|
return convert_by_vocab(vocab, tokens)
|
||||||
|
|
||||||
|
|
||||||
|
def convert_ids_to_tokens(inv_vocab, ids):
|
||||||
|
return convert_by_vocab(inv_vocab, ids)
|
||||||
|
|
||||||
|
|
||||||
|
def whitespace_tokenize(text):
|
||||||
|
"""Runs basic whitespace cleaning and splitting on a piece of text."""
|
||||||
|
text = text.strip()
|
||||||
|
if not text:
|
||||||
|
return []
|
||||||
|
tokens = text.split()
|
||||||
|
return tokens
|
||||||
|
|
||||||
|
|
||||||
|
class FullTokenizer(object):
|
||||||
|
"""Runs end-to-end tokenization."""
|
||||||
|
|
||||||
|
def __init__(self, vocab_file, do_lower_case=True):
|
||||||
|
self.vocab = load_vocab(vocab_file)
|
||||||
|
self.inv_vocab = {v: k for k, v in self.vocab.items()}
|
||||||
|
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
|
||||||
|
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
|
||||||
|
|
||||||
|
def tokenize(self, text):
|
||||||
|
split_tokens = []
|
||||||
|
for token in self.basic_tokenizer.tokenize(text):
|
||||||
|
for sub_token in self.wordpiece_tokenizer.tokenize(token):
|
||||||
|
split_tokens.append(sub_token)
|
||||||
|
|
||||||
|
return split_tokens
|
||||||
|
|
||||||
|
def convert_tokens_to_ids(self, tokens):
|
||||||
|
return convert_by_vocab(self.vocab, tokens)
|
||||||
|
|
||||||
|
def convert_ids_to_tokens(self, ids):
|
||||||
|
return convert_by_vocab(self.inv_vocab, ids)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def convert_tokens_to_string(tokens, clean_up_tokenization_spaces=True):
|
||||||
|
""" Converts a sequence of tokens (string) in a single string. """
|
||||||
|
|
||||||
|
def clean_up_tokenization(out_string):
|
||||||
|
""" Clean up a list of simple English tokenization artifacts
|
||||||
|
like spaces before punctuations and abreviated forms.
|
||||||
|
"""
|
||||||
|
out_string = (
|
||||||
|
out_string.replace(' .', '.').replace(' ?', '?').replace(
|
||||||
|
' !', '!').replace(' ,', ',').replace(" ' ", "'").replace(
|
||||||
|
" n't", "n't").replace(" 'm", "'m").replace(
|
||||||
|
" 's", "'s").replace(" 've",
|
||||||
|
"'ve").replace(" 're", "'re"))
|
||||||
|
return out_string
|
||||||
|
|
||||||
|
text = ' '.join(tokens).replace(' ##', '').strip()
|
||||||
|
if clean_up_tokenization_spaces:
|
||||||
|
clean_text = clean_up_tokenization(text)
|
||||||
|
return clean_text
|
||||||
|
else:
|
||||||
|
return text
|
||||||
|
|
||||||
|
def vocab_size(self):
|
||||||
|
return len(self.vocab)
|
||||||
|
|
||||||
|
|
||||||
|
class BasicTokenizer(object):
|
||||||
|
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
|
||||||
|
|
||||||
|
def __init__(self, do_lower_case=True):
|
||||||
|
"""Constructs a BasicTokenizer.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
do_lower_case: Whether to lower case the input.
|
||||||
|
"""
|
||||||
|
self.do_lower_case = do_lower_case
|
||||||
|
|
||||||
|
def tokenize(self, text):
|
||||||
|
"""Tokenizes a piece of text."""
|
||||||
|
text = convert_to_unicode(text)
|
||||||
|
text = self._clean_text(text)
|
||||||
|
|
||||||
|
# This was added on November 1st, 2018 for the multilingual and Chinese
|
||||||
|
# models. This is also applied to the English models now, but it doesn't
|
||||||
|
# matter since the English models were not trained on any Chinese data
|
||||||
|
# and generally don't have any Chinese data in them (there are Chinese
|
||||||
|
# characters in the vocabulary because Wikipedia does have some Chinese
|
||||||
|
# words in the English Wikipedia.).
|
||||||
|
text = self._tokenize_chinese_chars(text)
|
||||||
|
|
||||||
|
orig_tokens = whitespace_tokenize(text)
|
||||||
|
split_tokens = []
|
||||||
|
for token in orig_tokens:
|
||||||
|
if self.do_lower_case:
|
||||||
|
token = token.lower()
|
||||||
|
token = self._run_strip_accents(token)
|
||||||
|
split_tokens.extend(self._run_split_on_punc(token))
|
||||||
|
|
||||||
|
output_tokens = whitespace_tokenize(' '.join(split_tokens))
|
||||||
|
return output_tokens
|
||||||
|
|
||||||
|
def _run_strip_accents(self, text):
|
||||||
|
"""Strips accents from a piece of text."""
|
||||||
|
text = unicodedata.normalize('NFD', text)
|
||||||
|
output = []
|
||||||
|
for char in text:
|
||||||
|
cat = unicodedata.category(char)
|
||||||
|
if cat == 'Mn':
|
||||||
|
continue
|
||||||
|
output.append(char)
|
||||||
|
return ''.join(output)
|
||||||
|
|
||||||
|
def _run_split_on_punc(self, text):
|
||||||
|
"""Splits punctuation on a piece of text."""
|
||||||
|
chars = list(text)
|
||||||
|
i = 0
|
||||||
|
start_new_word = True
|
||||||
|
output = []
|
||||||
|
while i < len(chars):
|
||||||
|
char = chars[i]
|
||||||
|
if _is_punctuation(char):
|
||||||
|
output.append([char])
|
||||||
|
start_new_word = True
|
||||||
|
else:
|
||||||
|
if start_new_word:
|
||||||
|
output.append([])
|
||||||
|
start_new_word = False
|
||||||
|
output[-1].append(char)
|
||||||
|
i += 1
|
||||||
|
|
||||||
|
return [''.join(x) for x in output]
|
||||||
|
|
||||||
|
def _tokenize_chinese_chars(self, text):
|
||||||
|
"""Adds whitespace around any CJK character."""
|
||||||
|
output = []
|
||||||
|
for char in text:
|
||||||
|
cp = ord(char)
|
||||||
|
if self._is_chinese_char(cp):
|
||||||
|
output.append(' ')
|
||||||
|
output.append(char)
|
||||||
|
output.append(' ')
|
||||||
|
else:
|
||||||
|
output.append(char)
|
||||||
|
return ''.join(output)
|
||||||
|
|
||||||
|
def _is_chinese_char(self, cp):
|
||||||
|
"""Checks whether CP is the codepoint of a CJK character."""
|
||||||
|
# This defines a "chinese character" as anything in the CJK Unicode block:
|
||||||
|
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
|
||||||
|
#
|
||||||
|
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
|
||||||
|
# despite its name. The modern Korean Hangul alphabet is a different block,
|
||||||
|
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
|
||||||
|
# space-separated words, so they are not treated specially and handled
|
||||||
|
# like the all of the other languages.
|
||||||
|
if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF)
|
||||||
|
or (cp >= 0x20000 and cp <= 0x2A6DF)
|
||||||
|
or (cp >= 0x2A700 and cp <= 0x2B73F)
|
||||||
|
or (cp >= 0x2B740 and cp <= 0x2B81F)
|
||||||
|
or (cp >= 0x2B820 and cp <= 0x2CEAF)
|
||||||
|
or (cp >= 0xF900 and cp <= 0xFAFF)
|
||||||
|
or (cp >= 0x2F800 and cp <= 0x2FA1F)):
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
def _clean_text(self, text):
|
||||||
|
"""Performs invalid character removal and whitespace cleanup on text."""
|
||||||
|
output = []
|
||||||
|
for char in text:
|
||||||
|
cp = ord(char)
|
||||||
|
if cp == 0 or cp == 0xfffd or _is_control(char):
|
||||||
|
continue
|
||||||
|
if _is_whitespace(char):
|
||||||
|
output.append(' ')
|
||||||
|
else:
|
||||||
|
output.append(char)
|
||||||
|
return ''.join(output)
|
||||||
|
|
||||||
|
|
||||||
|
class WordpieceTokenizer(object):
|
||||||
|
"""Runs WordPiece tokenization."""
|
||||||
|
|
||||||
|
def __init__(self, vocab, unk_token='[UNK]', max_input_chars_per_word=200):
|
||||||
|
self.vocab = vocab
|
||||||
|
self.unk_token = unk_token
|
||||||
|
self.max_input_chars_per_word = max_input_chars_per_word
|
||||||
|
|
||||||
|
def tokenize(self, text):
|
||||||
|
"""Tokenizes a piece of text into its word pieces.
|
||||||
|
|
||||||
|
This uses a greedy longest-match-first algorithm to perform tokenization
|
||||||
|
using the given vocabulary.
|
||||||
|
|
||||||
|
For example:
|
||||||
|
input = "unaffable"
|
||||||
|
output = ["un", "##aff", "##able"]
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text: A single token or whitespace separated tokens. This should have
|
||||||
|
already been passed through `BasicTokenizer.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A list of wordpiece tokens.
|
||||||
|
"""
|
||||||
|
|
||||||
|
text = convert_to_unicode(text)
|
||||||
|
|
||||||
|
output_tokens = []
|
||||||
|
for token in whitespace_tokenize(text):
|
||||||
|
chars = list(token)
|
||||||
|
if len(chars) > self.max_input_chars_per_word:
|
||||||
|
output_tokens.append(self.unk_token)
|
||||||
|
continue
|
||||||
|
|
||||||
|
is_bad = False
|
||||||
|
start = 0
|
||||||
|
sub_tokens = []
|
||||||
|
while start < len(chars):
|
||||||
|
end = len(chars)
|
||||||
|
cur_substr = None
|
||||||
|
while start < end:
|
||||||
|
substr = ''.join(chars[start:end])
|
||||||
|
if start > 0:
|
||||||
|
substr = '##' + substr
|
||||||
|
if substr in self.vocab:
|
||||||
|
cur_substr = substr
|
||||||
|
break
|
||||||
|
end -= 1
|
||||||
|
if cur_substr is None:
|
||||||
|
is_bad = True
|
||||||
|
break
|
||||||
|
sub_tokens.append(cur_substr)
|
||||||
|
start = end
|
||||||
|
|
||||||
|
if is_bad:
|
||||||
|
output_tokens.append(self.unk_token)
|
||||||
|
else:
|
||||||
|
output_tokens.extend(sub_tokens)
|
||||||
|
return output_tokens
|
||||||
|
|
||||||
|
|
||||||
|
def _is_whitespace(char):
|
||||||
|
"""Checks whether `chars` is a whitespace character."""
|
||||||
|
# \t, \n, and \r are technically contorl characters but we treat them
|
||||||
|
# as whitespace since they are generally considered as such.
|
||||||
|
if char == ' ' or char == '\t' or char == '\n' or char == '\r':
|
||||||
|
return True
|
||||||
|
cat = unicodedata.category(char)
|
||||||
|
if cat == 'Zs':
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _is_control(char):
|
||||||
|
"""Checks whether `chars` is a control character."""
|
||||||
|
# These are technically control characters but we count them as whitespace
|
||||||
|
# characters.
|
||||||
|
if char == '\t' or char == '\n' or char == '\r':
|
||||||
|
return False
|
||||||
|
cat = unicodedata.category(char)
|
||||||
|
if cat in ('Cc', 'Cf'):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _is_punctuation(char):
|
||||||
|
"""Checks whether `chars` is a punctuation character."""
|
||||||
|
cp = ord(char)
|
||||||
|
# We treat all non-letter/number ASCII as punctuation.
|
||||||
|
# Characters such as "^", "$", and "`" are not in the Unicode
|
||||||
|
# Punctuation class but we treat them as punctuation anyways, for
|
||||||
|
# consistency.
|
||||||
|
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64)
|
||||||
|
or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
|
||||||
|
return True
|
||||||
|
cat = unicodedata.category(char)
|
||||||
|
if cat.startswith('P'):
|
||||||
|
return True
|
||||||
|
return False
|
77
AnyText/AnyText_scripts/AnyText_dataset_util.py
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
import ujson
|
||||||
|
import json
|
||||||
|
import pathlib
|
||||||
|
|
||||||
|
__all__ = ['load', 'save', 'show_bbox_on_image']
|
||||||
|
|
||||||
|
|
||||||
|
def load(file_path: str):
|
||||||
|
file_path = pathlib.Path(file_path)
|
||||||
|
func_dict = {'.txt': load_txt, '.json': load_json, '.list': load_txt}
|
||||||
|
assert file_path.suffix in func_dict
|
||||||
|
return func_dict[file_path.suffix](file_path)
|
||||||
|
|
||||||
|
|
||||||
|
def load_txt(file_path: str):
|
||||||
|
with open(file_path, 'r', encoding='utf8') as f:
|
||||||
|
content = [x.strip().strip('\ufeff').strip('\xef\xbb\xbf') for x in f.readlines()]
|
||||||
|
return content
|
||||||
|
|
||||||
|
|
||||||
|
def load_json(file_path: str):
|
||||||
|
with open(file_path, 'rb') as f:
|
||||||
|
content = f.read()
|
||||||
|
return ujson.loads(content)
|
||||||
|
|
||||||
|
|
||||||
|
def save(data, file_path):
|
||||||
|
file_path = pathlib.Path(file_path)
|
||||||
|
func_dict = {'.txt': save_txt, '.json': save_json}
|
||||||
|
assert file_path.suffix in func_dict
|
||||||
|
return func_dict[file_path.suffix](data, file_path)
|
||||||
|
|
||||||
|
|
||||||
|
def save_txt(data, file_path):
|
||||||
|
if not isinstance(data, list):
|
||||||
|
data = [data]
|
||||||
|
with open(file_path, mode='w', encoding='utf8') as f:
|
||||||
|
f.write('\n'.join(data))
|
||||||
|
|
||||||
|
|
||||||
|
def save_json(data, file_path):
|
||||||
|
with open(file_path, 'w', encoding='utf-8') as json_file:
|
||||||
|
json.dump(data, json_file, ensure_ascii=False, indent=4)
|
||||||
|
|
||||||
|
|
||||||
|
def show_bbox_on_image(image, polygons=None, txt=None, color=None, font_path='./font/Arial_Unicode.ttf'):
|
||||||
|
from PIL import ImageDraw, ImageFont
|
||||||
|
image = image.convert('RGB')
|
||||||
|
draw = ImageDraw.Draw(image)
|
||||||
|
if len(txt) == 0:
|
||||||
|
txt = None
|
||||||
|
if color is None:
|
||||||
|
color = (255, 0, 0)
|
||||||
|
if txt is not None:
|
||||||
|
font = ImageFont.truetype(font_path, 20)
|
||||||
|
for i, box in enumerate(polygons):
|
||||||
|
box = box[0]
|
||||||
|
if txt is not None:
|
||||||
|
draw.text((int(box[0][0]) + 20, int(box[0][1]) - 20), str(txt[i]), fill='red', font=font)
|
||||||
|
for j in range(len(box) - 1):
|
||||||
|
draw.line((box[j][0], box[j][1], box[j + 1][0], box[j + 1][1]), fill=color, width=2)
|
||||||
|
draw.line((box[-1][0], box[-1][1], box[0][0], box[0][1]), fill=color, width=2)
|
||||||
|
return image
|
||||||
|
|
||||||
|
|
||||||
|
def show_glyphs(glyphs, name):
|
||||||
|
import numpy as np
|
||||||
|
import cv2
|
||||||
|
size = 64
|
||||||
|
gap = 5
|
||||||
|
n_char = 20
|
||||||
|
canvas = np.ones((size, size*n_char + gap*(n_char-1), 1))*0.5
|
||||||
|
x = 0
|
||||||
|
for i in range(glyphs.shape[-1]):
|
||||||
|
canvas[:, x:x + size, :] = glyphs[..., i:i+1]
|
||||||
|
x += size+gap
|
||||||
|
cv2.imwrite(name, canvas*255)
|
365
AnyText/AnyText_scripts/AnyText_pipeline.py
Normal file
@ -0,0 +1,365 @@
|
|||||||
|
import os
|
||||||
|
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
|
||||||
|
import torch
|
||||||
|
import random
|
||||||
|
import re
|
||||||
|
import numpy as np
|
||||||
|
import cv2
|
||||||
|
import einops
|
||||||
|
import time
|
||||||
|
from PIL import ImageFont
|
||||||
|
from .cldm.model import create_model, load_state_dict
|
||||||
|
from .cldm.ddim_hacked import DDIMSampler
|
||||||
|
from .AnyText_t3_dataset import draw_glyph, draw_glyph2
|
||||||
|
from .AnyText_pipeline_util import check_channels, resize_image
|
||||||
|
from pytorch_lightning import seed_everything
|
||||||
|
from .AnyText_bert_tokenizer import BasicTokenizer
|
||||||
|
import folder_paths
|
||||||
|
from huggingface_hub import hf_hub_download
|
||||||
|
from ..utils import is_module_imported, t5_translate_en_ru_zh
|
||||||
|
|
||||||
|
checker = BasicTokenizer()
|
||||||
|
BBOX_MAX_NUM = 8
|
||||||
|
PLACE_HOLDER = '*'
|
||||||
|
max_chars = 20
|
||||||
|
|
||||||
|
comfyui_models_dir = folder_paths.models_dir
|
||||||
|
|
||||||
|
class AnyText_Pipeline():
|
||||||
|
def __init__(self, ckpt_path, clip_path, translator_path, cfg_path, use_translator, device, use_fp16, all_to_device, loaded_model_tensor):
|
||||||
|
self.device = device
|
||||||
|
self.use_fp16 = use_fp16
|
||||||
|
self.translator_path = translator_path
|
||||||
|
self.cfg_path = cfg_path
|
||||||
|
if ckpt_path != 'None':
|
||||||
|
ckpt_path = ckpt_path
|
||||||
|
else:
|
||||||
|
if os.access(os.path.join(comfyui_models_dir, "checkpoints", "15", "anytext_v1.1.safetensors"), os.F_OK):
|
||||||
|
ckpt_path = os.path.join(comfyui_models_dir, "checkpoints", "15", "anytext_v1.1.safetensors")
|
||||||
|
else:
|
||||||
|
hf_hub_download(repo_id="Sanster/AnyText", filename="pytorch_model.fp16.safetensors",local_dir=os.path.join(comfyui_models_dir, "checkpoints", "15"))
|
||||||
|
old_file = os.path.join(comfyui_models_dir, "checkpoints", "15", "pytorch_model.fp16.safetensors")
|
||||||
|
new_file = os.path.join(comfyui_models_dir, "checkpoints", "15", "anytext_v1.1.safetensors")
|
||||||
|
os.rename(old_file, new_file)
|
||||||
|
ckpt_path = new_file
|
||||||
|
if "Auto_DownLoad" not in clip_path:
|
||||||
|
clip_path = clip_path
|
||||||
|
else:
|
||||||
|
clip_path = "openai/clip-vit-large-patch14"
|
||||||
|
self.clip_path = clip_path
|
||||||
|
self.ckpt_path = ckpt_path
|
||||||
|
|
||||||
|
if loaded_model_tensor == None:
|
||||||
|
self.model = create_model(self.cfg_path, cond_stage_path=self.clip_path, use_fp16=self.use_fp16)
|
||||||
|
if self.use_fp16:
|
||||||
|
self.model = self.model.half().to(self.device)
|
||||||
|
if all_to_device == True:
|
||||||
|
self.model.load_state_dict(load_state_dict(self.ckpt_path, location=device), strict=False)
|
||||||
|
else:
|
||||||
|
self.model.load_state_dict(load_state_dict(self.ckpt_path, location='cpu'), strict=False)
|
||||||
|
else:
|
||||||
|
self.model = loaded_model_tensor
|
||||||
|
self.model.to(device)
|
||||||
|
|
||||||
|
self.model.eval()
|
||||||
|
self.ddim_sampler = DDIMSampler(self.model, device=self.device)
|
||||||
|
if use_translator == True:
|
||||||
|
#加载中译英模型,模型地址https://modelscope.cn/models/iic/nlp_csanmt_translation_zh2en
|
||||||
|
if "utrobinmv/t5_translate_en_ru_zh_small_1024" in translator_path:
|
||||||
|
self.trans_pipe = "utrobinmv/t5_translate_en_ru_zh_small_1024"
|
||||||
|
else:
|
||||||
|
self.zh2en_path = os.path.join(folder_paths.models_dir, "prompt_generator", "nlp_csanmt_translation_zh2en")
|
||||||
|
if not os.access(os.path.join(self.zh2en_path, "tf_ckpts", "ckpt-0.data-00000-of-00001"), os.F_OK):
|
||||||
|
self.zh2en_path = "damo/nlp_csanmt_translation_zh2en"
|
||||||
|
if not is_module_imported('pipeline'):
|
||||||
|
from modelscope.pipelines import pipeline
|
||||||
|
if not is_module_imported('Tasks'):
|
||||||
|
from modelscope.utils.constant import Tasks
|
||||||
|
self.trans_pipe = pipeline(task=Tasks.translation, model=self.zh2en_path, device=self.device)
|
||||||
|
else:
|
||||||
|
self.trans_pipe = None
|
||||||
|
|
||||||
|
def __call__(self, input_tensor, font_path, cpu_offload, **forward_params):
|
||||||
|
if "Auto_DownLoad" not in font_path:
|
||||||
|
font_path = font_path
|
||||||
|
else:
|
||||||
|
if os.access(os.path.join(comfyui_models_dir, "fonts", "SourceHanSansSC-Medium.otf"), os.F_OK):
|
||||||
|
font_path = os.path.join(comfyui_models_dir, "fonts", "SourceHanSansSC-Medium.otf")
|
||||||
|
else:
|
||||||
|
hf_hub_download(repo_id="Sanster/AnyText", filename="SourceHanSansSC-Medium.otf",local_dir=os.path.join(comfyui_models_dir, "fonts"))
|
||||||
|
font_path = os.path.join(comfyui_models_dir, "fonts", "SourceHanSansSC-Medium.otf")
|
||||||
|
self.font = ImageFont.truetype(font_path, size=60, encoding='utf-8')
|
||||||
|
tic = time.time()
|
||||||
|
str_warning = ''
|
||||||
|
# get inputs
|
||||||
|
seed = input_tensor.get('seed', -1)
|
||||||
|
if seed == -1:
|
||||||
|
seed = random.randint(0, 99999999)
|
||||||
|
seed_everything(seed)
|
||||||
|
prompt = input_tensor.get('prompt')
|
||||||
|
draw_pos = input_tensor.get('draw_pos')
|
||||||
|
ori_image = input_tensor.get('ori_image')
|
||||||
|
|
||||||
|
mode = forward_params.get('mode')
|
||||||
|
use_fp16 = forward_params.get('use_fp16')
|
||||||
|
Random_Gen = forward_params.get('Random_Gen')
|
||||||
|
sort_priority = forward_params.get('sort_priority', '↕')
|
||||||
|
show_debug = forward_params.get('show_debug', False)
|
||||||
|
revise_pos = forward_params.get('revise_pos', False)
|
||||||
|
img_count = forward_params.get('image_count', 1)
|
||||||
|
ddim_steps = forward_params.get('ddim_steps', 20)
|
||||||
|
w = forward_params.get('image_width', 512)
|
||||||
|
h = forward_params.get('image_height', 512)
|
||||||
|
strength = forward_params.get('strength', 1.0)
|
||||||
|
cfg_scale = forward_params.get('cfg_scale', 9.0)
|
||||||
|
eta = forward_params.get('eta', 0.0)
|
||||||
|
a_prompt = forward_params.get('a_prompt', 'best quality, extremely detailed,4k, HD, supper legible text, clear text edges, clear strokes, neat writing, no watermarks')
|
||||||
|
n_prompt = forward_params.get('n_prompt', 'low-res, bad anatomy, extra digit, fewer digits, cropped, worst quality, low quality, watermark, unreadable text, messy words, distorted text, disorganized writing, advertising picture')
|
||||||
|
|
||||||
|
prompt, texts = self.modify_prompt(prompt)
|
||||||
|
if prompt is None and texts is None:
|
||||||
|
return None, -1, "You have input Chinese prompt but the translator is not loaded!", ""
|
||||||
|
n_lines = len(texts)
|
||||||
|
if mode in ['text-generation', 'gen']:
|
||||||
|
if Random_Gen == True:
|
||||||
|
edit_image = np.ones((h, w, 3)) * 127.5 # empty mask image
|
||||||
|
edit_image = resize_image(edit_image, max_length=768)
|
||||||
|
h, w = edit_image.shape[:2]
|
||||||
|
else:
|
||||||
|
edit_image = cv2.imread(draw_pos)[..., ::-1]
|
||||||
|
edit_image = resize_image(edit_image, max_length=768)
|
||||||
|
h, w = edit_image.shape[:2]
|
||||||
|
edit_image = np.ones((h, w, 3)) * 127.5 # empty mask image
|
||||||
|
elif mode in ['text-editing', 'edit']:
|
||||||
|
if draw_pos is None or ori_image is None:
|
||||||
|
return None, -1, "Reference image and position image are needed for text editing!", ""
|
||||||
|
if isinstance(ori_image, str):
|
||||||
|
ori_image = cv2.imread(ori_image)[..., ::-1]
|
||||||
|
assert ori_image is not None, f"Can't read ori_image image from{ori_image}!"
|
||||||
|
elif isinstance(ori_image, torch.Tensor):
|
||||||
|
ori_image = ori_image.cpu().numpy()
|
||||||
|
else:
|
||||||
|
assert isinstance(ori_image, np.ndarray), f'Unknown format of ori_image: {type(ori_image)}'
|
||||||
|
edit_image = ori_image.clip(1, 255) # for mask reason
|
||||||
|
edit_image = check_channels(edit_image)
|
||||||
|
edit_image = resize_image(edit_image, max_length=768) # make w h multiple of 64, resize if w or h > max_length
|
||||||
|
h, w = edit_image.shape[:2] # change h, w by input ref_img
|
||||||
|
# preprocess pos_imgs(if numpy, make sure it's white pos in black bg)
|
||||||
|
if draw_pos is None:
|
||||||
|
pos_imgs = np.zeros((w, h, 1))
|
||||||
|
if isinstance(draw_pos, str):
|
||||||
|
draw_pos = cv2.imread(draw_pos)[..., ::-1]
|
||||||
|
draw_pos = resize_image(draw_pos, max_length=768)
|
||||||
|
draw_pos = cv2.resize(draw_pos, (w, h))
|
||||||
|
assert draw_pos is not None, f"Can't read draw_pos image from{draw_pos}!"
|
||||||
|
pos_imgs = 255-draw_pos
|
||||||
|
elif isinstance(draw_pos, torch.Tensor):
|
||||||
|
pos_imgs = draw_pos.cpu().numpy()
|
||||||
|
else:
|
||||||
|
assert isinstance(draw_pos, np.ndarray), f'Unknown format of draw_pos: {type(draw_pos)}'
|
||||||
|
pos_imgs = pos_imgs[..., 0:1]
|
||||||
|
pos_imgs = cv2.convertScaleAbs(pos_imgs)
|
||||||
|
_, pos_imgs = cv2.threshold(pos_imgs, 254, 255, cv2.THRESH_BINARY)
|
||||||
|
# seprate pos_imgs
|
||||||
|
pos_imgs = self.separate_pos_imgs(pos_imgs, sort_priority)
|
||||||
|
if len(pos_imgs) == 0:
|
||||||
|
pos_imgs = [np.zeros((h, w, 1))]
|
||||||
|
if len(pos_imgs) < n_lines:
|
||||||
|
if n_lines == 1 and texts[0] == ' ':
|
||||||
|
pass # text-to-image without text
|
||||||
|
else:
|
||||||
|
return None, -1, f'Found {len(pos_imgs)} positions that < needed {n_lines} from prompt, check and try again!', ''
|
||||||
|
elif len(pos_imgs) > n_lines:
|
||||||
|
str_warning = f'Warning: found {len(pos_imgs)} positions that > needed {n_lines} from prompt.'
|
||||||
|
# get pre_pos, poly_list, hint that needed for anytext
|
||||||
|
pre_pos = []
|
||||||
|
poly_list = []
|
||||||
|
for input_pos in pos_imgs:
|
||||||
|
if input_pos.mean() != 0:
|
||||||
|
input_pos = input_pos[..., np.newaxis] if len(input_pos.shape) == 2 else input_pos
|
||||||
|
poly, pos_img = self.find_polygon(input_pos)
|
||||||
|
pre_pos += [pos_img/255.]
|
||||||
|
poly_list += [poly]
|
||||||
|
else:
|
||||||
|
pre_pos += [np.zeros((h, w, 1))]
|
||||||
|
poly_list += [None]
|
||||||
|
np_hint = np.sum(pre_pos, axis=0).clip(0, 1)
|
||||||
|
# prepare info dict
|
||||||
|
info = {}
|
||||||
|
info['glyphs'] = []
|
||||||
|
info['gly_line'] = []
|
||||||
|
info['positions'] = []
|
||||||
|
info['n_lines'] = [len(texts)]*img_count
|
||||||
|
gly_pos_imgs = []
|
||||||
|
for i in range(len(texts)):
|
||||||
|
text = texts[i]
|
||||||
|
if len(text) > max_chars:
|
||||||
|
str_warning = f'"{text}" length > max_chars: {max_chars}, will be cut off...'
|
||||||
|
text = text[:max_chars]
|
||||||
|
gly_scale = 2
|
||||||
|
if pre_pos[i].mean() != 0:
|
||||||
|
gly_line = draw_glyph(self.font, text)
|
||||||
|
glyphs = draw_glyph2(self.font, text, poly_list[i], scale=gly_scale, width=w, height=h, add_space=False)
|
||||||
|
gly_pos_img = cv2.drawContours(glyphs*255, [poly_list[i]*gly_scale], 0, (255, 255, 255), 1)
|
||||||
|
if revise_pos:
|
||||||
|
resize_gly = cv2.resize(glyphs, (pre_pos[i].shape[1], pre_pos[i].shape[0]))
|
||||||
|
new_pos = cv2.morphologyEx((resize_gly*255).astype(np.uint8), cv2.MORPH_CLOSE, kernel=np.ones((resize_gly.shape[0]//10, resize_gly.shape[1]//10), dtype=np.uint8), iterations=1)
|
||||||
|
new_pos = new_pos[..., np.newaxis] if len(new_pos.shape) == 2 else new_pos
|
||||||
|
contours, _ = cv2.findContours(new_pos, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
|
||||||
|
if len(contours) != 1:
|
||||||
|
str_warning = f'Fail to revise position {i} to bounding rect, remain position unchanged...'
|
||||||
|
else:
|
||||||
|
rect = cv2.minAreaRect(contours[0])
|
||||||
|
poly = np.int0(cv2.boxPoints(rect))
|
||||||
|
pre_pos[i] = cv2.drawContours(new_pos, [poly], -1, 255, -1) / 255.
|
||||||
|
gly_pos_img = cv2.drawContours(glyphs*255, [poly*gly_scale], 0, (255, 255, 255), 1)
|
||||||
|
gly_pos_imgs += [gly_pos_img] # for show
|
||||||
|
else:
|
||||||
|
glyphs = np.zeros((h*gly_scale, w*gly_scale, 1))
|
||||||
|
gly_line = np.zeros((80, 512, 1))
|
||||||
|
gly_pos_imgs += [np.zeros((h*gly_scale, w*gly_scale, 1))] # for show
|
||||||
|
pos = pre_pos[i]
|
||||||
|
info['glyphs'] += [self.arr2tensor(glyphs, img_count, use_fp16)]
|
||||||
|
info['gly_line'] += [self.arr2tensor(gly_line, img_count, use_fp16)]
|
||||||
|
info['positions'] += [self.arr2tensor(pos, img_count, use_fp16)]
|
||||||
|
# get masked_x
|
||||||
|
masked_img = ((edit_image.astype(np.float32) / 127.5) - 1.0)*(1-np_hint)
|
||||||
|
masked_img = np.transpose(masked_img, (2, 0, 1))
|
||||||
|
masked_img = torch.from_numpy(masked_img.copy()).float().to(self.device)
|
||||||
|
# 确保模型在正确的设备上
|
||||||
|
self.model = self.model.to(self.device)
|
||||||
|
# 将masked_img移动到正确的设备并设置正确的数据类型
|
||||||
|
masked_img = masked_img.to(self.device)
|
||||||
|
if self.use_fp16:
|
||||||
|
masked_img = masked_img.half()
|
||||||
|
else:
|
||||||
|
masked_img = masked_img.float()
|
||||||
|
encoder_posterior = self.model.encode_first_stage(masked_img[None, ...])
|
||||||
|
masked_x = self.model.get_first_stage_encoding(encoder_posterior).detach()
|
||||||
|
if self.use_fp16:
|
||||||
|
masked_x = masked_x.half()
|
||||||
|
info['masked_x'] = torch.cat([masked_x for _ in range(img_count)], dim=0)
|
||||||
|
|
||||||
|
hint = self.arr2tensor(np_hint, img_count, use_fp16)
|
||||||
|
cond = self.model.get_learned_conditioning(dict(c_concat=[hint], c_crossattn=[[prompt + ' , ' + a_prompt] * img_count], text_info=info))
|
||||||
|
un_cond = self.model.get_learned_conditioning(dict(c_concat=[hint], c_crossattn=[[n_prompt] * img_count], text_info=info))
|
||||||
|
shape = (4, h // 8, w // 8)
|
||||||
|
self.model.control_scales = ([strength] * 13)
|
||||||
|
samples, intermediates = self.ddim_sampler.sample(ddim_steps, img_count,
|
||||||
|
shape, cond, verbose=False, eta=eta,
|
||||||
|
unconditional_guidance_scale=cfg_scale,
|
||||||
|
unconditional_conditioning=un_cond)
|
||||||
|
if self.use_fp16:
|
||||||
|
samples = samples.half()
|
||||||
|
x_samples = self.model.decode_first_stage(samples)
|
||||||
|
x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
|
||||||
|
results = [x_samples[i] for i in range(img_count)]
|
||||||
|
if len(gly_pos_imgs) > 0 and show_debug:
|
||||||
|
glyph_bs = np.stack(gly_pos_imgs, axis=2)
|
||||||
|
glyph_img = np.sum(glyph_bs, axis=2) * 255
|
||||||
|
glyph_img = glyph_img.clip(0, 255).astype(np.uint8)
|
||||||
|
results += [np.repeat(glyph_img, 3, axis=2)]
|
||||||
|
input_prompt = prompt
|
||||||
|
for t in texts:
|
||||||
|
input_prompt = input_prompt.replace('*', f'"{t}"', 1)
|
||||||
|
print(f'Prompt: {input_prompt}')
|
||||||
|
# debug_info
|
||||||
|
if not show_debug:
|
||||||
|
debug_info = ''
|
||||||
|
else:
|
||||||
|
debug_info = f'\033[93mPrompt(提示词): {input_prompt}\n\033[0m \
|
||||||
|
\033[93mSize(尺寸): {w}x{h}\n\033[0m \
|
||||||
|
\033[93mImage Count(生成数量): {img_count}\n\033[0m \
|
||||||
|
\033[93mSeed(种子): {seed}\n\033[0m \
|
||||||
|
\033[93mUse FP16(使用FP16): {self.use_fp16}\n\033[0m \
|
||||||
|
\033[93mUse Device(使用设备): {self.device}\n\033[0m \
|
||||||
|
\033[93mCost Time(生成耗时): {(time.time()-tic):.2f}s\033[0m'
|
||||||
|
rst_code = 1 if str_warning else 0
|
||||||
|
|
||||||
|
if cpu_offload == True:
|
||||||
|
self.model.to('cpu')
|
||||||
|
else:
|
||||||
|
if self.model != None:
|
||||||
|
del self.model
|
||||||
|
import gc
|
||||||
|
gc.collect()
|
||||||
|
if torch.cuda.is_available():
|
||||||
|
torch.cuda.empty_cache()
|
||||||
|
self.model = None
|
||||||
|
|
||||||
|
return x_samples, results, rst_code, str_warning, debug_info, self.model
|
||||||
|
|
||||||
|
def modify_prompt(self, prompt):
|
||||||
|
prompt = prompt.replace('“', '"')
|
||||||
|
prompt = prompt.replace('”', '"')
|
||||||
|
p = '"(.*?)"'
|
||||||
|
strs = re.findall(p, prompt)
|
||||||
|
if len(strs) == 0:
|
||||||
|
strs = [' ']
|
||||||
|
else:
|
||||||
|
for s in strs:
|
||||||
|
prompt = prompt.replace(f'"{s}"', f' {PLACE_HOLDER} ', 1)
|
||||||
|
if self.is_chinese(prompt):
|
||||||
|
if self.trans_pipe is None:
|
||||||
|
return None, None
|
||||||
|
old_prompt = prompt
|
||||||
|
if self.trans_pipe == "utrobinmv/t5_translate_en_ru_zh_small_1024":
|
||||||
|
self.zh2en_path = os.path.join(folder_paths.models_dir, "prompt_generator", "models--utrobinmv--t5_translate_en_ru_zh_small_1024")
|
||||||
|
if not os.access(os.path.join(self.zh2en_path, "model.safetensors"), os.F_OK):
|
||||||
|
self.zh2en_path = "utrobinmv/t5_translate_en_ru_zh_small_1024"
|
||||||
|
prompt = t5_translate_en_ru_zh('en', prompt + ' .', self.zh2en_path, self.device)[0]
|
||||||
|
else:
|
||||||
|
prompt = self.trans_pipe(input=prompt + ' .')['translation'][:-1]
|
||||||
|
del self.trans_pipe
|
||||||
|
if torch.cuda.is_available():
|
||||||
|
torch.cuda.empty_cache()
|
||||||
|
print(f'Translate: {old_prompt} --> {prompt}')
|
||||||
|
return prompt, strs
|
||||||
|
|
||||||
|
def is_chinese(self, text):
|
||||||
|
text = checker._clean_text(text)
|
||||||
|
for char in text:
|
||||||
|
cp = ord(char)
|
||||||
|
if checker._is_chinese_char(cp):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
def separate_pos_imgs(self, img, sort_priority, gap=102):
|
||||||
|
num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(img)
|
||||||
|
components = []
|
||||||
|
for label in range(1, num_labels):
|
||||||
|
component = np.zeros_like(img)
|
||||||
|
component[labels == label] = 255
|
||||||
|
components.append((component, centroids[label]))
|
||||||
|
if sort_priority == '↕':
|
||||||
|
fir, sec = 1, 0 # top-down first
|
||||||
|
elif sort_priority == '↔':
|
||||||
|
fir, sec = 0, 1 # left-right first
|
||||||
|
components.sort(key=lambda c: (c[1][fir]//gap, c[1][sec]//gap))
|
||||||
|
sorted_components = [c[0] for c in components]
|
||||||
|
return sorted_components
|
||||||
|
|
||||||
|
def find_polygon(self, image, min_rect=False):
|
||||||
|
contours, hierarchy = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
|
||||||
|
max_contour = max(contours, key=cv2.contourArea) # get contour with max area
|
||||||
|
if min_rect:
|
||||||
|
# get minimum enclosing rectangle
|
||||||
|
rect = cv2.minAreaRect(max_contour)
|
||||||
|
poly = np.int0(cv2.boxPoints(rect))
|
||||||
|
else:
|
||||||
|
# get approximate polygon
|
||||||
|
epsilon = 0.01 * cv2.arcLength(max_contour, True)
|
||||||
|
poly = cv2.approxPolyDP(max_contour, epsilon, True)
|
||||||
|
n, _, xy = poly.shape
|
||||||
|
poly = poly.reshape(n, xy)
|
||||||
|
cv2.drawContours(image, [poly], -1, 255, -1)
|
||||||
|
return poly, image
|
||||||
|
|
||||||
|
def arr2tensor(self, arr, bs, use_fp16):
|
||||||
|
self.use_fp16 = use_fp16
|
||||||
|
arr = np.transpose(arr, (2, 0, 1))
|
||||||
|
_arr = torch.from_numpy(arr.copy()).float().to(self.device)
|
||||||
|
if self.use_fp16:
|
||||||
|
_arr = _arr.half()
|
||||||
|
_arr = torch.stack([_arr for _ in range(bs)], dim=0)
|
||||||
|
return _arr
|
24
AnyText/AnyText_scripts/AnyText_pipeline_util.py
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
import cv2
|
||||||
|
|
||||||
|
def check_channels(image):
|
||||||
|
channels = image.shape[2] if len(image.shape) == 3 else 1
|
||||||
|
if channels == 1:
|
||||||
|
image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
|
||||||
|
elif channels > 3:
|
||||||
|
image = image[:, :, :3]
|
||||||
|
return image
|
||||||
|
|
||||||
|
|
||||||
|
def resize_image(img, max_length=768):
|
||||||
|
height, width = img.shape[:2]
|
||||||
|
max_dimension = max(height, width)
|
||||||
|
|
||||||
|
if max_dimension > max_length:
|
||||||
|
scale_factor = max_length / max_dimension
|
||||||
|
new_width = int(round(width * scale_factor))
|
||||||
|
new_height = int(round(height * scale_factor))
|
||||||
|
new_size = (new_width, new_height)
|
||||||
|
img = cv2.resize(img, new_size)
|
||||||
|
height, width = img.shape[:2]
|
||||||
|
img = cv2.resize(img, (width-(width % 64), height-(height % 64)))
|
||||||
|
return img
|
454
AnyText/AnyText_scripts/AnyText_t3_dataset.py
Normal file
@ -0,0 +1,454 @@
|
|||||||
|
import os
|
||||||
|
import numpy as np
|
||||||
|
import cv2
|
||||||
|
import random
|
||||||
|
import math
|
||||||
|
import time
|
||||||
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
|
from torch.utils.data import Dataset, DataLoader
|
||||||
|
from .AnyText_dataset_util import load, show_bbox_on_image
|
||||||
|
|
||||||
|
|
||||||
|
phrase_list = [
|
||||||
|
', content and position of the texts are ',
|
||||||
|
', textual material depicted in the image are ',
|
||||||
|
', texts that says ',
|
||||||
|
', captions shown in the snapshot are ',
|
||||||
|
', with the words of ',
|
||||||
|
', that reads ',
|
||||||
|
', the written materials on the picture: ',
|
||||||
|
', these texts are written on it: ',
|
||||||
|
', captions are ',
|
||||||
|
', content of the text in the graphic is '
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def insert_spaces(string, nSpace):
|
||||||
|
if nSpace == 0:
|
||||||
|
return string
|
||||||
|
new_string = ""
|
||||||
|
for char in string:
|
||||||
|
new_string += char + " " * nSpace
|
||||||
|
return new_string[:-nSpace]
|
||||||
|
|
||||||
|
|
||||||
|
def draw_glyph(font, text):
|
||||||
|
g_size = 50
|
||||||
|
W, H = (512, 80)
|
||||||
|
new_font = font.font_variant(size=g_size)
|
||||||
|
img = Image.new(mode='1', size=(W, H), color=0)
|
||||||
|
draw = ImageDraw.Draw(img)
|
||||||
|
left, top, right, bottom = new_font.getbbox(text)
|
||||||
|
text_width = max(right-left, 5)
|
||||||
|
text_height = max(bottom - top, 5)
|
||||||
|
ratio = min(W*0.9/text_width, H*0.9/text_height)
|
||||||
|
new_font = font.font_variant(size=int(g_size*ratio))
|
||||||
|
|
||||||
|
# text_width, text_height = new_font.getsize(text)
|
||||||
|
#增加使用pillow>9.5
|
||||||
|
x0, y0, x1, y1 = new_font.getbbox(text)
|
||||||
|
text_width, text_height = x1-x0, y1-y0
|
||||||
|
# offset_x, offset_y = new_font.getoffset(text)
|
||||||
|
#增加使用pillow>9.5
|
||||||
|
offset_x, offset_y = text_width, text_height
|
||||||
|
x = (img.width - text_width) // 2
|
||||||
|
y = (img.height - text_height) // 2 - offset_y//2
|
||||||
|
draw.text((x, y), text, font=new_font, fill='white')
|
||||||
|
img = np.expand_dims(np.array(img), axis=2).astype(np.float64)
|
||||||
|
return img
|
||||||
|
|
||||||
|
|
||||||
|
def draw_glyph2(font, text, polygon, vertAng=10, scale=1, width=512, height=512, add_space=True):
|
||||||
|
enlarge_polygon = polygon*scale
|
||||||
|
rect = cv2.minAreaRect(enlarge_polygon)
|
||||||
|
box = cv2.boxPoints(rect)
|
||||||
|
box = np.int0(box)
|
||||||
|
w, h = rect[1]
|
||||||
|
angle = rect[2]
|
||||||
|
if angle < -45:
|
||||||
|
angle += 90
|
||||||
|
angle = -angle
|
||||||
|
if w < h:
|
||||||
|
angle += 90
|
||||||
|
|
||||||
|
vert = False
|
||||||
|
if (abs(angle) % 90 < vertAng or abs(90-abs(angle) % 90) % 90 < vertAng):
|
||||||
|
_w = max(box[:, 0]) - min(box[:, 0])
|
||||||
|
_h = max(box[:, 1]) - min(box[:, 1])
|
||||||
|
if _h >= _w:
|
||||||
|
vert = True
|
||||||
|
angle = 0
|
||||||
|
|
||||||
|
img = np.zeros((height*scale, width*scale, 3), np.uint8)
|
||||||
|
img = Image.fromarray(img)
|
||||||
|
|
||||||
|
# infer font size
|
||||||
|
image4ratio = Image.new("RGB", img.size, "white")
|
||||||
|
draw = ImageDraw.Draw(image4ratio)
|
||||||
|
_, _, _tw, _th = draw.textbbox(xy=(0, 0), text=text, font=font)
|
||||||
|
text_w = min(w, h) * (_tw / _th)
|
||||||
|
if text_w <= max(w, h):
|
||||||
|
# add space
|
||||||
|
if len(text) > 1 and not vert and add_space:
|
||||||
|
for i in range(1, 100):
|
||||||
|
text_space = insert_spaces(text, i)
|
||||||
|
_, _, _tw2, _th2 = draw.textbbox(xy=(0, 0), text=text_space, font=font)
|
||||||
|
if min(w, h) * (_tw2 / _th2) > max(w, h):
|
||||||
|
break
|
||||||
|
text = insert_spaces(text, i-1)
|
||||||
|
font_size = min(w, h)*0.80
|
||||||
|
else:
|
||||||
|
shrink = 0.75 if vert else 0.85
|
||||||
|
font_size = min(w, h) / (text_w/max(w, h)) * shrink
|
||||||
|
new_font = font.font_variant(size=int(font_size))
|
||||||
|
|
||||||
|
left, top, right, bottom = new_font.getbbox(text)
|
||||||
|
text_width = right-left
|
||||||
|
text_height = bottom - top
|
||||||
|
|
||||||
|
layer = Image.new('RGBA', img.size, (0, 0, 0, 0))
|
||||||
|
draw = ImageDraw.Draw(layer)
|
||||||
|
if not vert:
|
||||||
|
draw.text((rect[0][0]-text_width//2, rect[0][1]-text_height//2-top), text, font=new_font, fill=(255, 255, 255, 255))
|
||||||
|
else:
|
||||||
|
x_s = min(box[:, 0]) + _w//2 - text_height//2
|
||||||
|
y_s = min(box[:, 1])
|
||||||
|
for c in text:
|
||||||
|
draw.text((x_s, y_s), c, font=new_font, fill=(255, 255, 255, 255))
|
||||||
|
_, _t, _, _b = new_font.getbbox(c)
|
||||||
|
y_s += _b
|
||||||
|
|
||||||
|
rotated_layer = layer.rotate(angle, expand=1, center=(rect[0][0], rect[0][1]))
|
||||||
|
|
||||||
|
x_offset = int((img.width - rotated_layer.width) / 2)
|
||||||
|
y_offset = int((img.height - rotated_layer.height) / 2)
|
||||||
|
img.paste(rotated_layer, (x_offset, y_offset), rotated_layer)
|
||||||
|
img = np.expand_dims(np.array(img.convert('1')), axis=2).astype(np.float64)
|
||||||
|
return img
|
||||||
|
|
||||||
|
|
||||||
|
def get_caption_pos(ori_caption, pos_idxs, prob=1.0, place_holder='*'):
|
||||||
|
idx2pos = {
|
||||||
|
0: " top left",
|
||||||
|
1: " top",
|
||||||
|
2: " top right",
|
||||||
|
3: " left",
|
||||||
|
4: random.choice([" middle", " center"]),
|
||||||
|
5: " right",
|
||||||
|
6: " bottom left",
|
||||||
|
7: " bottom",
|
||||||
|
8: " bottom right"
|
||||||
|
}
|
||||||
|
new_caption = ori_caption + random.choice(phrase_list)
|
||||||
|
pos = ''
|
||||||
|
for i in range(len(pos_idxs)):
|
||||||
|
if random.random() < prob and pos_idxs[i] > 0:
|
||||||
|
pos += place_holder + random.choice([' located', ' placed', ' positioned', '']) + random.choice([' at', ' in', ' on']) + idx2pos[pos_idxs[i]] + ', '
|
||||||
|
else:
|
||||||
|
pos += place_holder + ' , '
|
||||||
|
pos = pos[:-2] + '.'
|
||||||
|
new_caption += pos
|
||||||
|
return new_caption
|
||||||
|
|
||||||
|
|
||||||
|
def generate_random_rectangles(w, h, box_num):
|
||||||
|
rectangles = []
|
||||||
|
for i in range(box_num):
|
||||||
|
x = random.randint(0, w)
|
||||||
|
y = random.randint(0, h)
|
||||||
|
w = random.randint(16, 256)
|
||||||
|
h = random.randint(16, 96)
|
||||||
|
angle = random.randint(-45, 45)
|
||||||
|
p1 = (x, y)
|
||||||
|
p2 = (x + w, y)
|
||||||
|
p3 = (x + w, y + h)
|
||||||
|
p4 = (x, y + h)
|
||||||
|
center = ((x + x + w) / 2, (y + y + h) / 2)
|
||||||
|
p1 = rotate_point(p1, center, angle)
|
||||||
|
p2 = rotate_point(p2, center, angle)
|
||||||
|
p3 = rotate_point(p3, center, angle)
|
||||||
|
p4 = rotate_point(p4, center, angle)
|
||||||
|
rectangles.append((p1, p2, p3, p4))
|
||||||
|
return rectangles
|
||||||
|
|
||||||
|
|
||||||
|
def rotate_point(point, center, angle):
|
||||||
|
# rotation
|
||||||
|
angle = math.radians(angle)
|
||||||
|
x = point[0] - center[0]
|
||||||
|
y = point[1] - center[1]
|
||||||
|
x1 = x * math.cos(angle) - y * math.sin(angle)
|
||||||
|
y1 = x * math.sin(angle) + y * math.cos(angle)
|
||||||
|
x1 += center[0]
|
||||||
|
y1 += center[1]
|
||||||
|
return int(x1), int(y1)
|
||||||
|
|
||||||
|
|
||||||
|
class T3DataSet(Dataset):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
json_path,
|
||||||
|
max_lines=5,
|
||||||
|
max_chars=20,
|
||||||
|
place_holder='*',
|
||||||
|
font_path='./font/Arial_Unicode.ttf',
|
||||||
|
caption_pos_prob=1.0,
|
||||||
|
mask_pos_prob=1.0,
|
||||||
|
mask_img_prob=0.5,
|
||||||
|
for_show=False,
|
||||||
|
using_dlc=False,
|
||||||
|
glyph_scale=1,
|
||||||
|
percent=1.0,
|
||||||
|
debug=False,
|
||||||
|
wm_thresh=1.0,
|
||||||
|
):
|
||||||
|
assert isinstance(json_path, (str, list))
|
||||||
|
if isinstance(json_path, str):
|
||||||
|
json_path = [json_path]
|
||||||
|
data_list = []
|
||||||
|
self.using_dlc = using_dlc
|
||||||
|
self.max_lines = max_lines
|
||||||
|
self.max_chars = max_chars
|
||||||
|
self.place_holder = place_holder
|
||||||
|
self.font = ImageFont.truetype(font_path, size=60)
|
||||||
|
self.caption_pos_porb = caption_pos_prob
|
||||||
|
self.mask_pos_prob = mask_pos_prob
|
||||||
|
self.mask_img_prob = mask_img_prob
|
||||||
|
self.for_show = for_show
|
||||||
|
self.glyph_scale = glyph_scale
|
||||||
|
self.wm_thresh = wm_thresh
|
||||||
|
for jp in json_path:
|
||||||
|
data_list += self.load_data(jp, percent)
|
||||||
|
self.data_list = data_list
|
||||||
|
print(f'All dataset loaded, imgs={len(self.data_list)}')
|
||||||
|
self.debug = debug
|
||||||
|
if self.debug:
|
||||||
|
self.tmp_items = [i for i in range(100)]
|
||||||
|
|
||||||
|
def load_data(self, json_path, percent):
|
||||||
|
tic = time.time()
|
||||||
|
content = load(json_path)
|
||||||
|
d = []
|
||||||
|
count = 0
|
||||||
|
wm_skip = 0
|
||||||
|
max_img = len(content['data_list']) * percent
|
||||||
|
for gt in content['data_list']:
|
||||||
|
if len(d) > max_img:
|
||||||
|
break
|
||||||
|
if 'wm_score' in gt and gt['wm_score'] > self.wm_thresh: # wm_score > thresh will be skiped as an img with watermark
|
||||||
|
wm_skip += 1
|
||||||
|
continue
|
||||||
|
data_root = content['data_root']
|
||||||
|
if self.using_dlc:
|
||||||
|
data_root = data_root.replace('/data/vdb', '/mnt/data', 1)
|
||||||
|
img_path = os.path.join(data_root, gt['img_name'])
|
||||||
|
info = {}
|
||||||
|
info['img_path'] = img_path
|
||||||
|
info['caption'] = gt['caption'] if 'caption' in gt else ''
|
||||||
|
if self.place_holder in info['caption']:
|
||||||
|
count += 1
|
||||||
|
info['caption'] = info['caption'].replace(self.place_holder, " ")
|
||||||
|
if 'annotations' in gt:
|
||||||
|
polygons = []
|
||||||
|
invalid_polygons = []
|
||||||
|
texts = []
|
||||||
|
languages = []
|
||||||
|
pos = []
|
||||||
|
for annotation in gt['annotations']:
|
||||||
|
if len(annotation['polygon']) == 0:
|
||||||
|
continue
|
||||||
|
if 'valid' in annotation and annotation['valid'] is False:
|
||||||
|
invalid_polygons.append(annotation['polygon'])
|
||||||
|
continue
|
||||||
|
polygons.append(annotation['polygon'])
|
||||||
|
texts.append(annotation['text'])
|
||||||
|
languages.append(annotation['language'])
|
||||||
|
if 'pos' in annotation:
|
||||||
|
pos.append(annotation['pos'])
|
||||||
|
info['polygons'] = [np.array(i) for i in polygons]
|
||||||
|
info['invalid_polygons'] = [np.array(i) for i in invalid_polygons]
|
||||||
|
info['texts'] = texts
|
||||||
|
info['language'] = languages
|
||||||
|
info['pos'] = pos
|
||||||
|
d.append(info)
|
||||||
|
print(f'{json_path} loaded, imgs={len(d)}, wm_skip={wm_skip}, time={(time.time()-tic):.2f}s')
|
||||||
|
if count > 0:
|
||||||
|
print(f"Found {count} image's caption contain placeholder: {self.place_holder}, change to ' '...")
|
||||||
|
return d
|
||||||
|
|
||||||
|
def __getitem__(self, item):
|
||||||
|
item_dict = {}
|
||||||
|
if self.debug: # sample fixed items
|
||||||
|
item = self.tmp_items.pop()
|
||||||
|
print(f'item = {item}')
|
||||||
|
cur_item = self.data_list[item]
|
||||||
|
# img
|
||||||
|
target = np.array(Image.open(cur_item['img_path']).convert('RGB'))
|
||||||
|
if target.shape[0] != 512 or target.shape[1] != 512:
|
||||||
|
target = cv2.resize(target, (512, 512))
|
||||||
|
target = (target.astype(np.float32) / 127.5) - 1.0
|
||||||
|
item_dict['img'] = target
|
||||||
|
# caption
|
||||||
|
item_dict['caption'] = cur_item['caption']
|
||||||
|
item_dict['glyphs'] = []
|
||||||
|
item_dict['gly_line'] = []
|
||||||
|
item_dict['positions'] = []
|
||||||
|
item_dict['texts'] = []
|
||||||
|
item_dict['language'] = []
|
||||||
|
item_dict['inv_mask'] = []
|
||||||
|
texts = cur_item.get('texts', [])
|
||||||
|
if len(texts) > 0:
|
||||||
|
idxs = [i for i in range(len(texts))]
|
||||||
|
if len(texts) > self.max_lines:
|
||||||
|
sel_idxs = random.sample(idxs, self.max_lines)
|
||||||
|
unsel_idxs = [i for i in idxs if i not in sel_idxs]
|
||||||
|
else:
|
||||||
|
sel_idxs = idxs
|
||||||
|
unsel_idxs = []
|
||||||
|
if len(cur_item['pos']) > 0:
|
||||||
|
pos_idxs = [cur_item['pos'][i] for i in sel_idxs]
|
||||||
|
else:
|
||||||
|
pos_idxs = [-1 for i in sel_idxs]
|
||||||
|
item_dict['caption'] = get_caption_pos(item_dict['caption'], pos_idxs, self.caption_pos_porb, self.place_holder)
|
||||||
|
item_dict['polygons'] = [cur_item['polygons'][i] for i in sel_idxs]
|
||||||
|
item_dict['texts'] = [cur_item['texts'][i][:self.max_chars] for i in sel_idxs]
|
||||||
|
item_dict['language'] = [cur_item['language'][i] for i in sel_idxs]
|
||||||
|
# glyphs
|
||||||
|
for idx, text in enumerate(item_dict['texts']):
|
||||||
|
gly_line = draw_glyph(self.font, text)
|
||||||
|
glyphs = draw_glyph2(self.font, text, item_dict['polygons'][idx], scale=self.glyph_scale)
|
||||||
|
item_dict['glyphs'] += [glyphs]
|
||||||
|
item_dict['gly_line'] += [gly_line]
|
||||||
|
# mask_pos
|
||||||
|
for polygon in item_dict['polygons']:
|
||||||
|
item_dict['positions'] += [self.draw_pos(polygon, self.mask_pos_prob)]
|
||||||
|
# inv_mask
|
||||||
|
invalid_polygons = cur_item['invalid_polygons'] if 'invalid_polygons' in cur_item else []
|
||||||
|
if len(texts) > 0:
|
||||||
|
invalid_polygons += [cur_item['polygons'][i] for i in unsel_idxs]
|
||||||
|
item_dict['inv_mask'] = self.draw_inv_mask(invalid_polygons)
|
||||||
|
item_dict['hint'] = self.get_hint(item_dict['positions'])
|
||||||
|
if random.random() < self.mask_img_prob:
|
||||||
|
# randomly generate 0~3 masks
|
||||||
|
box_num = random.randint(0, 3)
|
||||||
|
boxes = generate_random_rectangles(512, 512, box_num)
|
||||||
|
boxes = np.array(boxes)
|
||||||
|
pos_list = item_dict['positions'].copy()
|
||||||
|
for i in range(box_num):
|
||||||
|
pos_list += [self.draw_pos(boxes[i], self.mask_pos_prob)]
|
||||||
|
mask = self.get_hint(pos_list)
|
||||||
|
masked_img = target*(1-mask)
|
||||||
|
else:
|
||||||
|
masked_img = np.zeros_like(target)
|
||||||
|
item_dict['masked_img'] = masked_img
|
||||||
|
|
||||||
|
if self.for_show:
|
||||||
|
item_dict['img_name'] = os.path.split(cur_item['img_path'])[-1]
|
||||||
|
return item_dict
|
||||||
|
if len(texts) > 0:
|
||||||
|
del item_dict['polygons']
|
||||||
|
# padding
|
||||||
|
n_lines = min(len(texts), self.max_lines)
|
||||||
|
item_dict['n_lines'] = n_lines
|
||||||
|
n_pad = self.max_lines - n_lines
|
||||||
|
if n_pad > 0:
|
||||||
|
item_dict['glyphs'] += [np.zeros((512*self.glyph_scale, 512*self.glyph_scale, 1))] * n_pad
|
||||||
|
item_dict['gly_line'] += [np.zeros((80, 512, 1))] * n_pad
|
||||||
|
item_dict['positions'] += [np.zeros((512, 512, 1))] * n_pad
|
||||||
|
item_dict['texts'] += [' '] * n_pad
|
||||||
|
item_dict['language'] += [' '] * n_pad
|
||||||
|
|
||||||
|
return item_dict
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return len(self.data_list)
|
||||||
|
|
||||||
|
def draw_inv_mask(self, polygons):
|
||||||
|
img = np.zeros((512, 512))
|
||||||
|
for p in polygons:
|
||||||
|
pts = p.reshape((-1, 1, 2))
|
||||||
|
cv2.fillPoly(img, [pts], color=255)
|
||||||
|
img = img[..., None]
|
||||||
|
return img/255.
|
||||||
|
|
||||||
|
def draw_pos(self, ploygon, prob=1.0):
|
||||||
|
img = np.zeros((512, 512))
|
||||||
|
rect = cv2.minAreaRect(ploygon)
|
||||||
|
w, h = rect[1]
|
||||||
|
small = False
|
||||||
|
if w < 20 or h < 20:
|
||||||
|
small = True
|
||||||
|
if random.random() < prob:
|
||||||
|
pts = ploygon.reshape((-1, 1, 2))
|
||||||
|
cv2.fillPoly(img, [pts], color=255)
|
||||||
|
# 10% dilate / 10% erode / 5% dilatex2 5% erodex2
|
||||||
|
random_value = random.random()
|
||||||
|
kernel = np.ones((3, 3), dtype=np.uint8)
|
||||||
|
if random_value < 0.7:
|
||||||
|
pass
|
||||||
|
elif random_value < 0.8:
|
||||||
|
img = cv2.dilate(img.astype(np.uint8), kernel, iterations=1)
|
||||||
|
elif random_value < 0.9 and not small:
|
||||||
|
img = cv2.erode(img.astype(np.uint8), kernel, iterations=1)
|
||||||
|
elif random_value < 0.95:
|
||||||
|
img = cv2.dilate(img.astype(np.uint8), kernel, iterations=2)
|
||||||
|
elif random_value < 1.0 and not small:
|
||||||
|
img = cv2.erode(img.astype(np.uint8), kernel, iterations=2)
|
||||||
|
img = img[..., None]
|
||||||
|
return img/255.
|
||||||
|
|
||||||
|
def get_hint(self, positions):
|
||||||
|
if len(positions) == 0:
|
||||||
|
return np.zeros((512, 512, 1))
|
||||||
|
return np.sum(positions, axis=0).clip(0, 1)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
'''
|
||||||
|
Run this script to show details of your dataset, such as ocr annotations, glyphs, prompts, etc.
|
||||||
|
'''
|
||||||
|
from tqdm import tqdm
|
||||||
|
from matplotlib import pyplot as plt
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
show_imgs_dir = 'show_results'
|
||||||
|
show_count = 50
|
||||||
|
if os.path.exists(show_imgs_dir):
|
||||||
|
shutil.rmtree(show_imgs_dir)
|
||||||
|
os.makedirs(show_imgs_dir)
|
||||||
|
plt.rcParams['axes.unicode_minus'] = False
|
||||||
|
json_paths = [
|
||||||
|
'/path/of/your/dataset/data1.json',
|
||||||
|
'/path/of/your/dataset/data2.json',
|
||||||
|
# ...
|
||||||
|
]
|
||||||
|
|
||||||
|
dataset = T3DataSet(json_paths, for_show=True, max_lines=20, glyph_scale=2, mask_img_prob=1.0, caption_pos_prob=0.0)
|
||||||
|
train_loader = DataLoader(dataset=dataset, batch_size=1, shuffle=False, num_workers=0)
|
||||||
|
pbar = tqdm(total=show_count)
|
||||||
|
for i, data in enumerate(train_loader):
|
||||||
|
if i == show_count:
|
||||||
|
break
|
||||||
|
img = ((data['img'][0].numpy() + 1.0) / 2.0 * 255).astype(np.uint8)
|
||||||
|
masked_img = ((data['masked_img'][0].numpy() + 1.0) / 2.0 * 255)[..., ::-1].astype(np.uint8)
|
||||||
|
cv2.imwrite(os.path.join(show_imgs_dir, f'plots_{i}_masked.jpg'), masked_img)
|
||||||
|
if 'texts' in data and len(data['texts']) > 0:
|
||||||
|
texts = [x[0] for x in data['texts']]
|
||||||
|
img = show_bbox_on_image(Image.fromarray(img), data['polygons'], texts)
|
||||||
|
cv2.imwrite(os.path.join(show_imgs_dir, f'plots_{i}.jpg'), np.array(img)[..., ::-1])
|
||||||
|
with open(os.path.join(show_imgs_dir, f'plots_{i}.txt'), 'w') as fin:
|
||||||
|
fin.writelines([data['caption'][0]])
|
||||||
|
all_glyphs = []
|
||||||
|
for k, glyphs in enumerate(data['glyphs']):
|
||||||
|
cv2.imwrite(os.path.join(show_imgs_dir, f'plots_{i}_glyph_{k}.jpg'), glyphs[0].numpy().astype(np.int32)*255)
|
||||||
|
all_glyphs += [glyphs[0].numpy().astype(np.int32)*255]
|
||||||
|
cv2.imwrite(os.path.join(show_imgs_dir, f'plots_{i}_allglyphs.jpg'), np.sum(all_glyphs, axis=0))
|
||||||
|
for k, gly_line in enumerate(data['gly_line']):
|
||||||
|
cv2.imwrite(os.path.join(show_imgs_dir, f'plots_{i}_gly_line_{k}.jpg'), gly_line[0].numpy().astype(np.int32)*255)
|
||||||
|
for k, position in enumerate(data['positions']):
|
||||||
|
if position is not None:
|
||||||
|
cv2.imwrite(os.path.join(show_imgs_dir, f'plots_{i}_pos_{k}.jpg'), position[0].numpy().astype(np.int32)*255)
|
||||||
|
cv2.imwrite(os.path.join(show_imgs_dir, f'plots_{i}_hint.jpg'), data['hint'][0].numpy().astype(np.int32)*255)
|
||||||
|
cv2.imwrite(os.path.join(show_imgs_dir, f'plots_{i}_inv_mask.jpg'), np.array(img)[..., ::-1]*(1-data['inv_mask'][0].numpy().astype(np.int32)))
|
||||||
|
pbar.update(1)
|
||||||
|
pbar.close()
|
627
AnyText/AnyText_scripts/cldm/cldm.py
Normal file
@ -0,0 +1,627 @@
|
|||||||
|
import einops
|
||||||
|
import torch
|
||||||
|
import torch as th
|
||||||
|
import torch.nn as nn
|
||||||
|
import copy
|
||||||
|
from easydict import EasyDict as edict
|
||||||
|
|
||||||
|
from ..ldm.modules.diffusionmodules.util import (
|
||||||
|
conv_nd,
|
||||||
|
linear,
|
||||||
|
zero_module,
|
||||||
|
timestep_embedding,
|
||||||
|
)
|
||||||
|
|
||||||
|
from einops import rearrange, repeat
|
||||||
|
from torchvision.utils import make_grid
|
||||||
|
from ..ldm.modules.attention import SpatialTransformer
|
||||||
|
from ..ldm.modules.diffusionmodules.openaimodel import UNetModel, TimestepEmbedSequential, ResBlock, Downsample, AttentionBlock
|
||||||
|
from ..ldm.models.diffusion.ddpm import LatentDiffusion
|
||||||
|
from ..ldm.util import log_txt_as_img, exists, instantiate_from_config
|
||||||
|
# from ldm.models.diffusion.ddim import DDIMSampler
|
||||||
|
from .ddim_hacked import DDIMSampler
|
||||||
|
from ..ldm.modules.distributions.distributions import DiagonalGaussianDistribution
|
||||||
|
from .recognizer import TextRecognizer, create_predictor
|
||||||
|
|
||||||
|
|
||||||
|
def count_parameters(model):
|
||||||
|
return sum(p.numel() for p in model.parameters() if p.requires_grad)
|
||||||
|
|
||||||
|
|
||||||
|
class ControlledUnetModel(UNetModel):
|
||||||
|
def forward(self, x, timesteps=None, context=None, control=None, only_mid_control=False, **kwargs):
|
||||||
|
hs = []
|
||||||
|
with torch.no_grad():
|
||||||
|
t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
|
||||||
|
if self.use_fp16:
|
||||||
|
t_emb = t_emb.half()
|
||||||
|
emb = self.time_embed(t_emb)
|
||||||
|
h = x.type(self.dtype)
|
||||||
|
for module in self.input_blocks:
|
||||||
|
h = module(h, emb, context)
|
||||||
|
hs.append(h)
|
||||||
|
h = self.middle_block(h, emb, context)
|
||||||
|
|
||||||
|
if control is not None:
|
||||||
|
h += control.pop()
|
||||||
|
|
||||||
|
for i, module in enumerate(self.output_blocks):
|
||||||
|
if only_mid_control or control is None:
|
||||||
|
h = torch.cat([h, hs.pop()], dim=1)
|
||||||
|
else:
|
||||||
|
h = torch.cat([h, hs.pop() + control.pop()], dim=1)
|
||||||
|
h = module(h, emb, context)
|
||||||
|
|
||||||
|
h = h.type(x.dtype)
|
||||||
|
return self.out(h)
|
||||||
|
|
||||||
|
|
||||||
|
class ControlNet(nn.Module):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
image_size,
|
||||||
|
in_channels,
|
||||||
|
model_channels,
|
||||||
|
glyph_channels,
|
||||||
|
position_channels,
|
||||||
|
num_res_blocks,
|
||||||
|
attention_resolutions,
|
||||||
|
dropout=0,
|
||||||
|
channel_mult=(1, 2, 4, 8),
|
||||||
|
conv_resample=True,
|
||||||
|
dims=2,
|
||||||
|
use_checkpoint=False,
|
||||||
|
use_fp16=False,
|
||||||
|
num_heads=-1,
|
||||||
|
num_head_channels=-1,
|
||||||
|
num_heads_upsample=-1,
|
||||||
|
use_scale_shift_norm=False,
|
||||||
|
resblock_updown=False,
|
||||||
|
use_new_attention_order=False,
|
||||||
|
use_spatial_transformer=False, # custom transformer support
|
||||||
|
transformer_depth=1, # custom transformer support
|
||||||
|
context_dim=None, # custom transformer support
|
||||||
|
n_embed=None, # custom support for prediction of discrete ids into codebook of first stage vq model
|
||||||
|
legacy=True,
|
||||||
|
disable_self_attentions=None,
|
||||||
|
num_attention_blocks=None,
|
||||||
|
disable_middle_self_attn=False,
|
||||||
|
use_linear_in_transformer=False,
|
||||||
|
):
|
||||||
|
super().__init__()
|
||||||
|
if use_spatial_transformer:
|
||||||
|
assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'
|
||||||
|
|
||||||
|
if context_dim is not None:
|
||||||
|
assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
|
||||||
|
from omegaconf.listconfig import ListConfig
|
||||||
|
if type(context_dim) == ListConfig:
|
||||||
|
context_dim = list(context_dim)
|
||||||
|
|
||||||
|
if num_heads_upsample == -1:
|
||||||
|
num_heads_upsample = num_heads
|
||||||
|
|
||||||
|
if num_heads == -1:
|
||||||
|
assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'
|
||||||
|
|
||||||
|
if num_head_channels == -1:
|
||||||
|
assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'
|
||||||
|
self.dims = dims
|
||||||
|
self.image_size = image_size
|
||||||
|
self.in_channels = in_channels
|
||||||
|
self.model_channels = model_channels
|
||||||
|
if isinstance(num_res_blocks, int):
|
||||||
|
self.num_res_blocks = len(channel_mult) * [num_res_blocks]
|
||||||
|
else:
|
||||||
|
if len(num_res_blocks) != len(channel_mult):
|
||||||
|
raise ValueError("provide num_res_blocks either as an int (globally constant) or "
|
||||||
|
"as a list/tuple (per-level) with the same length as channel_mult")
|
||||||
|
self.num_res_blocks = num_res_blocks
|
||||||
|
if disable_self_attentions is not None:
|
||||||
|
# should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
|
||||||
|
assert len(disable_self_attentions) == len(channel_mult)
|
||||||
|
if num_attention_blocks is not None:
|
||||||
|
assert len(num_attention_blocks) == len(self.num_res_blocks)
|
||||||
|
assert all(map(lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], range(len(num_attention_blocks))))
|
||||||
|
print(f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. "
|
||||||
|
f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
|
||||||
|
f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
|
||||||
|
f"attention will still not be set.")
|
||||||
|
self.attention_resolutions = attention_resolutions
|
||||||
|
self.dropout = dropout
|
||||||
|
self.channel_mult = channel_mult
|
||||||
|
self.conv_resample = conv_resample
|
||||||
|
self.use_checkpoint = use_checkpoint
|
||||||
|
self.use_fp16 = use_fp16
|
||||||
|
self.dtype = th.float16 if use_fp16 else th.float32
|
||||||
|
self.num_heads = num_heads
|
||||||
|
self.num_head_channels = num_head_channels
|
||||||
|
self.num_heads_upsample = num_heads_upsample
|
||||||
|
self.predict_codebook_ids = n_embed is not None
|
||||||
|
|
||||||
|
time_embed_dim = model_channels * 4
|
||||||
|
self.time_embed = nn.Sequential(
|
||||||
|
linear(model_channels, time_embed_dim),
|
||||||
|
nn.SiLU(),
|
||||||
|
linear(time_embed_dim, time_embed_dim),
|
||||||
|
)
|
||||||
|
|
||||||
|
self.input_blocks = nn.ModuleList(
|
||||||
|
[
|
||||||
|
TimestepEmbedSequential(
|
||||||
|
conv_nd(dims, in_channels, model_channels, 3, padding=1)
|
||||||
|
)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
self.zero_convs = nn.ModuleList([self.make_zero_conv(model_channels)])
|
||||||
|
|
||||||
|
self.glyph_block = TimestepEmbedSequential(
|
||||||
|
conv_nd(dims, glyph_channels, 8, 3, padding=1),
|
||||||
|
nn.SiLU(),
|
||||||
|
conv_nd(dims, 8, 8, 3, padding=1),
|
||||||
|
nn.SiLU(),
|
||||||
|
conv_nd(dims, 8, 16, 3, padding=1, stride=2),
|
||||||
|
nn.SiLU(),
|
||||||
|
conv_nd(dims, 16, 16, 3, padding=1),
|
||||||
|
nn.SiLU(),
|
||||||
|
conv_nd(dims, 16, 32, 3, padding=1, stride=2),
|
||||||
|
nn.SiLU(),
|
||||||
|
conv_nd(dims, 32, 32, 3, padding=1),
|
||||||
|
nn.SiLU(),
|
||||||
|
conv_nd(dims, 32, 96, 3, padding=1, stride=2),
|
||||||
|
nn.SiLU(),
|
||||||
|
conv_nd(dims, 96, 96, 3, padding=1),
|
||||||
|
nn.SiLU(),
|
||||||
|
conv_nd(dims, 96, 256, 3, padding=1, stride=2),
|
||||||
|
nn.SiLU(),
|
||||||
|
)
|
||||||
|
|
||||||
|
self.position_block = TimestepEmbedSequential(
|
||||||
|
conv_nd(dims, position_channels, 8, 3, padding=1),
|
||||||
|
nn.SiLU(),
|
||||||
|
conv_nd(dims, 8, 8, 3, padding=1),
|
||||||
|
nn.SiLU(),
|
||||||
|
conv_nd(dims, 8, 16, 3, padding=1, stride=2),
|
||||||
|
nn.SiLU(),
|
||||||
|
conv_nd(dims, 16, 16, 3, padding=1),
|
||||||
|
nn.SiLU(),
|
||||||
|
conv_nd(dims, 16, 32, 3, padding=1, stride=2),
|
||||||
|
nn.SiLU(),
|
||||||
|
conv_nd(dims, 32, 32, 3, padding=1),
|
||||||
|
nn.SiLU(),
|
||||||
|
conv_nd(dims, 32, 64, 3, padding=1, stride=2),
|
||||||
|
nn.SiLU(),
|
||||||
|
)
|
||||||
|
|
||||||
|
self.fuse_block = zero_module(conv_nd(dims, 256+64+4, model_channels, 3, padding=1))
|
||||||
|
|
||||||
|
self._feature_size = model_channels
|
||||||
|
input_block_chans = [model_channels]
|
||||||
|
ch = model_channels
|
||||||
|
ds = 1
|
||||||
|
for level, mult in enumerate(channel_mult):
|
||||||
|
for nr in range(self.num_res_blocks[level]):
|
||||||
|
layers = [
|
||||||
|
ResBlock(
|
||||||
|
ch,
|
||||||
|
time_embed_dim,
|
||||||
|
dropout,
|
||||||
|
out_channels=mult * model_channels,
|
||||||
|
dims=dims,
|
||||||
|
use_checkpoint=use_checkpoint,
|
||||||
|
use_scale_shift_norm=use_scale_shift_norm,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
ch = mult * model_channels
|
||||||
|
if ds in attention_resolutions:
|
||||||
|
if num_head_channels == -1:
|
||||||
|
dim_head = ch // num_heads
|
||||||
|
else:
|
||||||
|
num_heads = ch // num_head_channels
|
||||||
|
dim_head = num_head_channels
|
||||||
|
if legacy:
|
||||||
|
# num_heads = 1
|
||||||
|
dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
|
||||||
|
if exists(disable_self_attentions):
|
||||||
|
disabled_sa = disable_self_attentions[level]
|
||||||
|
else:
|
||||||
|
disabled_sa = False
|
||||||
|
|
||||||
|
if not exists(num_attention_blocks) or nr < num_attention_blocks[level]:
|
||||||
|
layers.append(
|
||||||
|
AttentionBlock(
|
||||||
|
ch,
|
||||||
|
use_checkpoint=use_checkpoint,
|
||||||
|
num_heads=num_heads,
|
||||||
|
num_head_channels=dim_head,
|
||||||
|
use_new_attention_order=use_new_attention_order,
|
||||||
|
) if not use_spatial_transformer else SpatialTransformer(
|
||||||
|
ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
|
||||||
|
disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer,
|
||||||
|
use_checkpoint=use_checkpoint
|
||||||
|
)
|
||||||
|
)
|
||||||
|
self.input_blocks.append(TimestepEmbedSequential(*layers))
|
||||||
|
self.zero_convs.append(self.make_zero_conv(ch))
|
||||||
|
self._feature_size += ch
|
||||||
|
input_block_chans.append(ch)
|
||||||
|
if level != len(channel_mult) - 1:
|
||||||
|
out_ch = ch
|
||||||
|
self.input_blocks.append(
|
||||||
|
TimestepEmbedSequential(
|
||||||
|
ResBlock(
|
||||||
|
ch,
|
||||||
|
time_embed_dim,
|
||||||
|
dropout,
|
||||||
|
out_channels=out_ch,
|
||||||
|
dims=dims,
|
||||||
|
use_checkpoint=use_checkpoint,
|
||||||
|
use_scale_shift_norm=use_scale_shift_norm,
|
||||||
|
down=True,
|
||||||
|
)
|
||||||
|
if resblock_updown
|
||||||
|
else Downsample(
|
||||||
|
ch, conv_resample, dims=dims, out_channels=out_ch
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
ch = out_ch
|
||||||
|
input_block_chans.append(ch)
|
||||||
|
self.zero_convs.append(self.make_zero_conv(ch))
|
||||||
|
ds *= 2
|
||||||
|
self._feature_size += ch
|
||||||
|
|
||||||
|
if num_head_channels == -1:
|
||||||
|
dim_head = ch // num_heads
|
||||||
|
else:
|
||||||
|
num_heads = ch // num_head_channels
|
||||||
|
dim_head = num_head_channels
|
||||||
|
if legacy:
|
||||||
|
# num_heads = 1
|
||||||
|
dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
|
||||||
|
self.middle_block = TimestepEmbedSequential(
|
||||||
|
ResBlock(
|
||||||
|
ch,
|
||||||
|
time_embed_dim,
|
||||||
|
dropout,
|
||||||
|
dims=dims,
|
||||||
|
use_checkpoint=use_checkpoint,
|
||||||
|
use_scale_shift_norm=use_scale_shift_norm,
|
||||||
|
),
|
||||||
|
AttentionBlock(
|
||||||
|
ch,
|
||||||
|
use_checkpoint=use_checkpoint,
|
||||||
|
num_heads=num_heads,
|
||||||
|
num_head_channels=dim_head,
|
||||||
|
use_new_attention_order=use_new_attention_order,
|
||||||
|
) if not use_spatial_transformer else SpatialTransformer( # always uses a self-attn
|
||||||
|
ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
|
||||||
|
disable_self_attn=disable_middle_self_attn, use_linear=use_linear_in_transformer,
|
||||||
|
use_checkpoint=use_checkpoint
|
||||||
|
),
|
||||||
|
ResBlock(
|
||||||
|
ch,
|
||||||
|
time_embed_dim,
|
||||||
|
dropout,
|
||||||
|
dims=dims,
|
||||||
|
use_checkpoint=use_checkpoint,
|
||||||
|
use_scale_shift_norm=use_scale_shift_norm,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
self.middle_block_out = self.make_zero_conv(ch)
|
||||||
|
self._feature_size += ch
|
||||||
|
|
||||||
|
def make_zero_conv(self, channels):
|
||||||
|
return TimestepEmbedSequential(zero_module(conv_nd(self.dims, channels, channels, 1, padding=0)))
|
||||||
|
|
||||||
|
def forward(self, x, hint, text_info, timesteps, context, **kwargs):
|
||||||
|
t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
|
||||||
|
if self.use_fp16:
|
||||||
|
t_emb = t_emb.half()
|
||||||
|
emb = self.time_embed(t_emb)
|
||||||
|
|
||||||
|
# guided_hint from text_info
|
||||||
|
B, C, H, W = x.shape
|
||||||
|
glyphs = torch.cat(text_info['glyphs'], dim=1).sum(dim=1, keepdim=True)
|
||||||
|
positions = torch.cat(text_info['positions'], dim=1).sum(dim=1, keepdim=True)
|
||||||
|
enc_glyph = self.glyph_block(glyphs, emb, context)
|
||||||
|
enc_pos = self.position_block(positions, emb, context)
|
||||||
|
guided_hint = self.fuse_block(torch.cat([enc_glyph, enc_pos, text_info['masked_x']], dim=1))
|
||||||
|
|
||||||
|
outs = []
|
||||||
|
|
||||||
|
h = x.type(self.dtype)
|
||||||
|
for module, zero_conv in zip(self.input_blocks, self.zero_convs):
|
||||||
|
if guided_hint is not None:
|
||||||
|
h = module(h, emb, context)
|
||||||
|
h += guided_hint
|
||||||
|
guided_hint = None
|
||||||
|
else:
|
||||||
|
h = module(h, emb, context)
|
||||||
|
outs.append(zero_conv(h, emb, context))
|
||||||
|
|
||||||
|
h = self.middle_block(h, emb, context)
|
||||||
|
outs.append(self.middle_block_out(h, emb, context))
|
||||||
|
|
||||||
|
return outs
|
||||||
|
|
||||||
|
|
||||||
|
class ControlLDM(LatentDiffusion):
|
||||||
|
|
||||||
|
def __init__(self, control_stage_config, control_key, glyph_key, position_key, only_mid_control, loss_alpha=0, loss_beta=0, with_step_weight=False, use_vae_upsample=False, latin_weight=1.0, embedding_manager_config=None, *args, **kwargs):
|
||||||
|
self.use_fp16 = kwargs.pop('use_fp16', False)
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
self.control_model = instantiate_from_config(control_stage_config)
|
||||||
|
self.control_key = control_key
|
||||||
|
self.glyph_key = glyph_key
|
||||||
|
self.position_key = position_key
|
||||||
|
self.only_mid_control = only_mid_control
|
||||||
|
self.control_scales = [1.0] * 13
|
||||||
|
self.loss_alpha = loss_alpha
|
||||||
|
self.loss_beta = loss_beta
|
||||||
|
self.with_step_weight = with_step_weight
|
||||||
|
self.use_vae_upsample = use_vae_upsample
|
||||||
|
self.latin_weight = latin_weight
|
||||||
|
|
||||||
|
if embedding_manager_config is not None and embedding_manager_config.params.valid:
|
||||||
|
self.embedding_manager = self.instantiate_embedding_manager(embedding_manager_config, self.cond_stage_model)
|
||||||
|
for param in self.embedding_manager.embedding_parameters():
|
||||||
|
param.requires_grad = True
|
||||||
|
else:
|
||||||
|
self.embedding_manager = None
|
||||||
|
if self.loss_alpha > 0 or self.loss_beta > 0 or self.embedding_manager:
|
||||||
|
if embedding_manager_config.params.emb_type == 'ocr':
|
||||||
|
self.text_predictor = create_predictor().eval()
|
||||||
|
args = edict()
|
||||||
|
args.rec_image_shape = "3, 48, 320"
|
||||||
|
args.rec_batch_num = 6
|
||||||
|
args.rec_char_dict_path = './ocr_recog/ppocr_keys_v1.txt'
|
||||||
|
args.use_fp16 = self.use_fp16
|
||||||
|
self.cn_recognizer = TextRecognizer(args, self.text_predictor)
|
||||||
|
for param in self.text_predictor.parameters():
|
||||||
|
param.requires_grad = False
|
||||||
|
if self.embedding_manager:
|
||||||
|
self.embedding_manager.recog = self.cn_recognizer
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def get_input(self, batch, k, bs=None, *args, **kwargs):
|
||||||
|
if self.embedding_manager is None: # fill in full caption
|
||||||
|
self.fill_caption(batch)
|
||||||
|
x, c, mx = super().get_input(batch, self.first_stage_key, mask_k='masked_img', *args, **kwargs)
|
||||||
|
control = batch[self.control_key] # for log_images and loss_alpha, not real control
|
||||||
|
if bs is not None:
|
||||||
|
control = control[:bs]
|
||||||
|
control = control.to(self.device)
|
||||||
|
control = einops.rearrange(control, 'b h w c -> b c h w')
|
||||||
|
control = control.to(memory_format=torch.contiguous_format).float()
|
||||||
|
|
||||||
|
inv_mask = batch['inv_mask']
|
||||||
|
if bs is not None:
|
||||||
|
inv_mask = inv_mask[:bs]
|
||||||
|
inv_mask = inv_mask.to(self.device)
|
||||||
|
inv_mask = einops.rearrange(inv_mask, 'b h w c -> b c h w')
|
||||||
|
inv_mask = inv_mask.to(memory_format=torch.contiguous_format).float()
|
||||||
|
|
||||||
|
glyphs = batch[self.glyph_key]
|
||||||
|
gly_line = batch['gly_line']
|
||||||
|
positions = batch[self.position_key]
|
||||||
|
n_lines = batch['n_lines']
|
||||||
|
language = batch['language']
|
||||||
|
texts = batch['texts']
|
||||||
|
assert len(glyphs) == len(positions)
|
||||||
|
for i in range(len(glyphs)):
|
||||||
|
if bs is not None:
|
||||||
|
glyphs[i] = glyphs[i][:bs]
|
||||||
|
gly_line[i] = gly_line[i][:bs]
|
||||||
|
positions[i] = positions[i][:bs]
|
||||||
|
n_lines = n_lines[:bs]
|
||||||
|
glyphs[i] = glyphs[i].to(self.device)
|
||||||
|
gly_line[i] = gly_line[i].to(self.device)
|
||||||
|
positions[i] = positions[i].to(self.device)
|
||||||
|
glyphs[i] = einops.rearrange(glyphs[i], 'b h w c -> b c h w')
|
||||||
|
gly_line[i] = einops.rearrange(gly_line[i], 'b h w c -> b c h w')
|
||||||
|
positions[i] = einops.rearrange(positions[i], 'b h w c -> b c h w')
|
||||||
|
glyphs[i] = glyphs[i].to(memory_format=torch.contiguous_format).float()
|
||||||
|
gly_line[i] = gly_line[i].to(memory_format=torch.contiguous_format).float()
|
||||||
|
positions[i] = positions[i].to(memory_format=torch.contiguous_format).float()
|
||||||
|
info = {}
|
||||||
|
info['glyphs'] = glyphs
|
||||||
|
info['positions'] = positions
|
||||||
|
info['n_lines'] = n_lines
|
||||||
|
info['language'] = language
|
||||||
|
info['texts'] = texts
|
||||||
|
info['img'] = batch['img'] # nhwc, (-1,1)
|
||||||
|
info['masked_x'] = mx
|
||||||
|
info['gly_line'] = gly_line
|
||||||
|
info['inv_mask'] = inv_mask
|
||||||
|
return x, dict(c_crossattn=[c], c_concat=[control], text_info=info)
|
||||||
|
|
||||||
|
def apply_model(self, x_noisy, t, cond, *args, **kwargs):
|
||||||
|
assert isinstance(cond, dict)
|
||||||
|
diffusion_model = self.model.diffusion_model
|
||||||
|
_cond = torch.cat(cond['c_crossattn'], 1)
|
||||||
|
_hint = torch.cat(cond['c_concat'], 1)
|
||||||
|
if self.use_fp16:
|
||||||
|
x_noisy = x_noisy.half()
|
||||||
|
control = self.control_model(x=x_noisy, timesteps=t, context=_cond, hint=_hint, text_info=cond['text_info'])
|
||||||
|
control = [c * scale for c, scale in zip(control, self.control_scales)]
|
||||||
|
eps = diffusion_model(x=x_noisy, timesteps=t, context=_cond, control=control, only_mid_control=self.only_mid_control)
|
||||||
|
|
||||||
|
return eps
|
||||||
|
|
||||||
|
def instantiate_embedding_manager(self, config, embedder):
|
||||||
|
model = instantiate_from_config(config, embedder=embedder)
|
||||||
|
return model
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def get_unconditional_conditioning(self, N):
|
||||||
|
return self.get_learned_conditioning(dict(c_crossattn=[[""] * N], text_info=None))
|
||||||
|
|
||||||
|
def get_learned_conditioning(self, c):
|
||||||
|
if self.cond_stage_forward is None:
|
||||||
|
if hasattr(self.cond_stage_model, 'encode') and callable(self.cond_stage_model.encode):
|
||||||
|
if self.embedding_manager is not None and c['text_info'] is not None:
|
||||||
|
self.embedding_manager.encode_text(c['text_info'])
|
||||||
|
if isinstance(c, dict):
|
||||||
|
cond_txt = c['c_crossattn'][0]
|
||||||
|
else:
|
||||||
|
cond_txt = c
|
||||||
|
if self.embedding_manager is not None:
|
||||||
|
cond_txt = self.cond_stage_model.encode(cond_txt, embedding_manager=self.embedding_manager)
|
||||||
|
else:
|
||||||
|
cond_txt = self.cond_stage_model.encode(cond_txt)
|
||||||
|
if isinstance(c, dict):
|
||||||
|
c['c_crossattn'][0] = cond_txt
|
||||||
|
else:
|
||||||
|
c = cond_txt
|
||||||
|
if isinstance(c, DiagonalGaussianDistribution):
|
||||||
|
c = c.mode()
|
||||||
|
else:
|
||||||
|
c = self.cond_stage_model(c)
|
||||||
|
else:
|
||||||
|
assert hasattr(self.cond_stage_model, self.cond_stage_forward)
|
||||||
|
c = getattr(self.cond_stage_model, self.cond_stage_forward)(c)
|
||||||
|
return c
|
||||||
|
|
||||||
|
def fill_caption(self, batch, place_holder='*'):
|
||||||
|
bs = len(batch['n_lines'])
|
||||||
|
cond_list = copy.deepcopy(batch[self.cond_stage_key])
|
||||||
|
for i in range(bs):
|
||||||
|
n_lines = batch['n_lines'][i]
|
||||||
|
if n_lines == 0:
|
||||||
|
continue
|
||||||
|
cur_cap = cond_list[i]
|
||||||
|
for j in range(n_lines):
|
||||||
|
r_txt = batch['texts'][j][i]
|
||||||
|
cur_cap = cur_cap.replace(place_holder, f'"{r_txt}"', 1)
|
||||||
|
cond_list[i] = cur_cap
|
||||||
|
batch[self.cond_stage_key] = cond_list
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def log_images(self, batch, N=4, n_row=2, sample=False, ddim_steps=50, ddim_eta=0.0, return_keys=None,
|
||||||
|
quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True,
|
||||||
|
plot_diffusion_rows=False, unconditional_guidance_scale=9.0, unconditional_guidance_label=None,
|
||||||
|
use_ema_scope=True,
|
||||||
|
**kwargs):
|
||||||
|
use_ddim = ddim_steps is not None
|
||||||
|
|
||||||
|
log = dict()
|
||||||
|
z, c = self.get_input(batch, self.first_stage_key, bs=N)
|
||||||
|
if self.cond_stage_trainable:
|
||||||
|
with torch.no_grad():
|
||||||
|
c = self.get_learned_conditioning(c)
|
||||||
|
c_crossattn = c["c_crossattn"][0][:N]
|
||||||
|
c_cat = c["c_concat"][0][:N]
|
||||||
|
text_info = c["text_info"]
|
||||||
|
text_info['glyphs'] = [i[:N] for i in text_info['glyphs']]
|
||||||
|
text_info['gly_line'] = [i[:N] for i in text_info['gly_line']]
|
||||||
|
text_info['positions'] = [i[:N] for i in text_info['positions']]
|
||||||
|
text_info['n_lines'] = text_info['n_lines'][:N]
|
||||||
|
text_info['masked_x'] = text_info['masked_x'][:N]
|
||||||
|
text_info['img'] = text_info['img'][:N]
|
||||||
|
|
||||||
|
N = min(z.shape[0], N)
|
||||||
|
n_row = min(z.shape[0], n_row)
|
||||||
|
log["reconstruction"] = self.decode_first_stage(z)
|
||||||
|
log["masked_image"] = self.decode_first_stage(text_info['masked_x'])
|
||||||
|
log["control"] = c_cat * 2.0 - 1.0
|
||||||
|
log["img"] = text_info['img'].permute(0, 3, 1, 2) # log source image if needed
|
||||||
|
# get glyph
|
||||||
|
glyph_bs = torch.stack(text_info['glyphs'])
|
||||||
|
glyph_bs = torch.sum(glyph_bs, dim=0) * 2.0 - 1.0
|
||||||
|
log["glyph"] = torch.nn.functional.interpolate(glyph_bs, size=(512, 512), mode='bilinear', align_corners=True,)
|
||||||
|
# fill caption
|
||||||
|
if not self.embedding_manager:
|
||||||
|
self.fill_caption(batch)
|
||||||
|
captions = batch[self.cond_stage_key]
|
||||||
|
log["conditioning"] = log_txt_as_img((512, 512), captions, size=16)
|
||||||
|
|
||||||
|
if plot_diffusion_rows:
|
||||||
|
# get diffusion row
|
||||||
|
diffusion_row = list()
|
||||||
|
z_start = z[:n_row]
|
||||||
|
for t in range(self.num_timesteps):
|
||||||
|
if t % self.log_every_t == 0 or t == self.num_timesteps - 1:
|
||||||
|
t = repeat(torch.tensor([t]), '1 -> b', b=n_row)
|
||||||
|
t = t.to(self.device).long()
|
||||||
|
noise = torch.randn_like(z_start)
|
||||||
|
z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise)
|
||||||
|
diffusion_row.append(self.decode_first_stage(z_noisy))
|
||||||
|
|
||||||
|
diffusion_row = torch.stack(diffusion_row) # n_log_step, n_row, C, H, W
|
||||||
|
diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w')
|
||||||
|
diffusion_grid = rearrange(diffusion_grid, 'b n c h w -> (b n) c h w')
|
||||||
|
diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0])
|
||||||
|
log["diffusion_row"] = diffusion_grid
|
||||||
|
|
||||||
|
if sample:
|
||||||
|
# get denoise row
|
||||||
|
samples, z_denoise_row = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c], "text_info": text_info},
|
||||||
|
batch_size=N, ddim=use_ddim,
|
||||||
|
ddim_steps=ddim_steps, eta=ddim_eta)
|
||||||
|
x_samples = self.decode_first_stage(samples)
|
||||||
|
log["samples"] = x_samples
|
||||||
|
if plot_denoise_rows:
|
||||||
|
denoise_grid = self._get_denoise_row_from_list(z_denoise_row)
|
||||||
|
log["denoise_row"] = denoise_grid
|
||||||
|
|
||||||
|
if unconditional_guidance_scale > 1.0:
|
||||||
|
uc_cross = self.get_unconditional_conditioning(N)
|
||||||
|
uc_cat = c_cat # torch.zeros_like(c_cat)
|
||||||
|
uc_full = {"c_concat": [uc_cat], "c_crossattn": [uc_cross['c_crossattn'][0]], "text_info": text_info}
|
||||||
|
samples_cfg, tmps = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c_crossattn], "text_info": text_info},
|
||||||
|
batch_size=N, ddim=use_ddim,
|
||||||
|
ddim_steps=ddim_steps, eta=ddim_eta,
|
||||||
|
unconditional_guidance_scale=unconditional_guidance_scale,
|
||||||
|
unconditional_conditioning=uc_full,
|
||||||
|
)
|
||||||
|
x_samples_cfg = self.decode_first_stage(samples_cfg)
|
||||||
|
log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg
|
||||||
|
pred_x0 = False # wether log pred_x0
|
||||||
|
if pred_x0:
|
||||||
|
for idx in range(len(tmps['pred_x0'])):
|
||||||
|
pred_x0 = self.decode_first_stage(tmps['pred_x0'][idx])
|
||||||
|
log[f"pred_x0_{tmps['index'][idx]}"] = pred_x0
|
||||||
|
|
||||||
|
return log
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def sample_log(self, cond, batch_size, ddim, ddim_steps, **kwargs):
|
||||||
|
ddim_sampler = DDIMSampler(self)
|
||||||
|
b, c, h, w = cond["c_concat"][0].shape
|
||||||
|
shape = (self.channels, h // 8, w // 8)
|
||||||
|
samples, intermediates = ddim_sampler.sample(ddim_steps, batch_size, shape, cond, verbose=False, log_every_t=5, **kwargs)
|
||||||
|
return samples, intermediates
|
||||||
|
|
||||||
|
def configure_optimizers(self):
|
||||||
|
lr = self.learning_rate
|
||||||
|
params = list(self.control_model.parameters())
|
||||||
|
if self.embedding_manager:
|
||||||
|
params += list(self.embedding_manager.embedding_parameters())
|
||||||
|
if not self.sd_locked:
|
||||||
|
# params += list(self.model.diffusion_model.input_blocks.parameters())
|
||||||
|
# params += list(self.model.diffusion_model.middle_block.parameters())
|
||||||
|
params += list(self.model.diffusion_model.output_blocks.parameters())
|
||||||
|
params += list(self.model.diffusion_model.out.parameters())
|
||||||
|
if self.unlockKV:
|
||||||
|
nCount = 0
|
||||||
|
for name, param in self.model.diffusion_model.named_parameters():
|
||||||
|
if 'attn2.to_k' in name or 'attn2.to_v' in name:
|
||||||
|
params += [param]
|
||||||
|
nCount += 1
|
||||||
|
print(f'Cross attention is unlocked, and {nCount} Wk or Wv are added to potimizers!!!')
|
||||||
|
|
||||||
|
opt = torch.optim.AdamW(params, lr=lr)
|
||||||
|
return opt
|
||||||
|
|
||||||
|
def low_vram_shift(self, is_diffusing):
|
||||||
|
if is_diffusing:
|
||||||
|
self.model = self.model.cuda()
|
||||||
|
self.control_model = self.control_model.cuda()
|
||||||
|
self.first_stage_model = self.first_stage_model.cpu()
|
||||||
|
self.cond_stage_model = self.cond_stage_model.cpu()
|
||||||
|
else:
|
||||||
|
self.model = self.model.cpu()
|
||||||
|
self.control_model = self.control_model.cpu()
|
||||||
|
self.first_stage_model = self.first_stage_model.cuda()
|
||||||
|
self.cond_stage_model = self.cond_stage_model.cuda()
|
317
AnyText/AnyText_scripts/cldm/ddim_hacked.py
Normal file
@ -0,0 +1,317 @@
|
|||||||
|
"""SAMPLING ONLY."""
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from ..ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like, extract_into_tensor
|
||||||
|
|
||||||
|
|
||||||
|
class DDIMSampler(object):
|
||||||
|
def __init__(self, model, schedule="linear", **kwargs):
|
||||||
|
super().__init__()
|
||||||
|
self.model = model
|
||||||
|
self.ddpm_num_timesteps = model.num_timesteps
|
||||||
|
self.schedule = schedule
|
||||||
|
|
||||||
|
def register_buffer(self, name, attr):
|
||||||
|
if type(attr) == torch.Tensor:
|
||||||
|
if attr.device != torch.device("cuda"):
|
||||||
|
attr = attr.to(torch.device("cuda"))
|
||||||
|
setattr(self, name, attr)
|
||||||
|
|
||||||
|
def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
|
||||||
|
self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
|
||||||
|
num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose)
|
||||||
|
alphas_cumprod = self.model.alphas_cumprod
|
||||||
|
assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
|
||||||
|
to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
|
||||||
|
|
||||||
|
self.register_buffer('betas', to_torch(self.model.betas))
|
||||||
|
self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
|
||||||
|
self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))
|
||||||
|
|
||||||
|
# calculations for diffusion q(x_t | x_{t-1}) and others
|
||||||
|
self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
|
||||||
|
self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
|
||||||
|
self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
|
||||||
|
self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
|
||||||
|
self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))
|
||||||
|
|
||||||
|
# ddim sampling parameters
|
||||||
|
ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
|
||||||
|
ddim_timesteps=self.ddim_timesteps,
|
||||||
|
eta=ddim_eta,verbose=verbose)
|
||||||
|
self.register_buffer('ddim_sigmas', ddim_sigmas)
|
||||||
|
self.register_buffer('ddim_alphas', ddim_alphas)
|
||||||
|
self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
|
||||||
|
self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
|
||||||
|
sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
|
||||||
|
(1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
|
||||||
|
1 - self.alphas_cumprod / self.alphas_cumprod_prev))
|
||||||
|
self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def sample(self,
|
||||||
|
S,
|
||||||
|
batch_size,
|
||||||
|
shape,
|
||||||
|
conditioning=None,
|
||||||
|
callback=None,
|
||||||
|
normals_sequence=None,
|
||||||
|
img_callback=None,
|
||||||
|
quantize_x0=False,
|
||||||
|
eta=0.,
|
||||||
|
mask=None,
|
||||||
|
x0=None,
|
||||||
|
temperature=1.,
|
||||||
|
noise_dropout=0.,
|
||||||
|
score_corrector=None,
|
||||||
|
corrector_kwargs=None,
|
||||||
|
verbose=True,
|
||||||
|
x_T=None,
|
||||||
|
log_every_t=100,
|
||||||
|
unconditional_guidance_scale=1.,
|
||||||
|
unconditional_conditioning=None, # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
|
||||||
|
dynamic_threshold=None,
|
||||||
|
ucg_schedule=None,
|
||||||
|
**kwargs
|
||||||
|
):
|
||||||
|
if conditioning is not None:
|
||||||
|
if isinstance(conditioning, dict):
|
||||||
|
ctmp = conditioning[list(conditioning.keys())[0]]
|
||||||
|
while isinstance(ctmp, list): ctmp = ctmp[0]
|
||||||
|
cbs = ctmp.shape[0]
|
||||||
|
if cbs != batch_size:
|
||||||
|
print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
|
||||||
|
|
||||||
|
elif isinstance(conditioning, list):
|
||||||
|
for ctmp in conditioning:
|
||||||
|
if ctmp.shape[0] != batch_size:
|
||||||
|
print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
|
||||||
|
|
||||||
|
else:
|
||||||
|
if conditioning.shape[0] != batch_size:
|
||||||
|
print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")
|
||||||
|
|
||||||
|
self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
|
||||||
|
# sampling
|
||||||
|
C, H, W = shape
|
||||||
|
size = (batch_size, C, H, W)
|
||||||
|
print(f'Data shape for DDIM sampling is {size}, eta {eta}')
|
||||||
|
|
||||||
|
samples, intermediates = self.ddim_sampling(conditioning, size,
|
||||||
|
callback=callback,
|
||||||
|
img_callback=img_callback,
|
||||||
|
quantize_denoised=quantize_x0,
|
||||||
|
mask=mask, x0=x0,
|
||||||
|
ddim_use_original_steps=False,
|
||||||
|
noise_dropout=noise_dropout,
|
||||||
|
temperature=temperature,
|
||||||
|
score_corrector=score_corrector,
|
||||||
|
corrector_kwargs=corrector_kwargs,
|
||||||
|
x_T=x_T,
|
||||||
|
log_every_t=log_every_t,
|
||||||
|
unconditional_guidance_scale=unconditional_guidance_scale,
|
||||||
|
unconditional_conditioning=unconditional_conditioning,
|
||||||
|
dynamic_threshold=dynamic_threshold,
|
||||||
|
ucg_schedule=ucg_schedule
|
||||||
|
)
|
||||||
|
return samples, intermediates
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def ddim_sampling(self, cond, shape,
|
||||||
|
x_T=None, ddim_use_original_steps=False,
|
||||||
|
callback=None, timesteps=None, quantize_denoised=False,
|
||||||
|
mask=None, x0=None, img_callback=None, log_every_t=100,
|
||||||
|
temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
|
||||||
|
unconditional_guidance_scale=1., unconditional_conditioning=None, dynamic_threshold=None,
|
||||||
|
ucg_schedule=None):
|
||||||
|
device = self.model.betas.device
|
||||||
|
b = shape[0]
|
||||||
|
if x_T is None:
|
||||||
|
img = torch.randn(shape, device=device)
|
||||||
|
else:
|
||||||
|
img = x_T
|
||||||
|
|
||||||
|
if timesteps is None:
|
||||||
|
timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
|
||||||
|
elif timesteps is not None and not ddim_use_original_steps:
|
||||||
|
subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
|
||||||
|
timesteps = self.ddim_timesteps[:subset_end]
|
||||||
|
|
||||||
|
intermediates = {'x_inter': [img], 'pred_x0': [img]}
|
||||||
|
time_range = reversed(range(0,timesteps)) if ddim_use_original_steps else np.flip(timesteps)
|
||||||
|
total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
|
||||||
|
print(f"Running DDIM Sampling with {total_steps} timesteps")
|
||||||
|
|
||||||
|
iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps)
|
||||||
|
|
||||||
|
for i, step in enumerate(iterator):
|
||||||
|
index = total_steps - i - 1
|
||||||
|
ts = torch.full((b,), step, device=device, dtype=torch.long)
|
||||||
|
|
||||||
|
if mask is not None:
|
||||||
|
assert x0 is not None
|
||||||
|
img_orig = self.model.q_sample(x0, ts) # TODO: deterministic forward pass?
|
||||||
|
img = img_orig * mask + (1. - mask) * img
|
||||||
|
|
||||||
|
if ucg_schedule is not None:
|
||||||
|
assert len(ucg_schedule) == len(time_range)
|
||||||
|
unconditional_guidance_scale = ucg_schedule[i]
|
||||||
|
|
||||||
|
outs = self.p_sample_ddim(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
|
||||||
|
quantize_denoised=quantize_denoised, temperature=temperature,
|
||||||
|
noise_dropout=noise_dropout, score_corrector=score_corrector,
|
||||||
|
corrector_kwargs=corrector_kwargs,
|
||||||
|
unconditional_guidance_scale=unconditional_guidance_scale,
|
||||||
|
unconditional_conditioning=unconditional_conditioning,
|
||||||
|
dynamic_threshold=dynamic_threshold)
|
||||||
|
img, pred_x0 = outs
|
||||||
|
if callback: callback(i)
|
||||||
|
if img_callback: img_callback(pred_x0, i)
|
||||||
|
|
||||||
|
if index % log_every_t == 0 or index == total_steps - 1:
|
||||||
|
intermediates['x_inter'].append(img)
|
||||||
|
intermediates['pred_x0'].append(pred_x0)
|
||||||
|
|
||||||
|
return img, intermediates
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
|
||||||
|
temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
|
||||||
|
unconditional_guidance_scale=1., unconditional_conditioning=None,
|
||||||
|
dynamic_threshold=None):
|
||||||
|
b, *_, device = *x.shape, x.device
|
||||||
|
|
||||||
|
if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
|
||||||
|
model_output = self.model.apply_model(x, t, c)
|
||||||
|
else:
|
||||||
|
model_t = self.model.apply_model(x, t, c)
|
||||||
|
model_uncond = self.model.apply_model(x, t, unconditional_conditioning)
|
||||||
|
model_output = model_uncond + unconditional_guidance_scale * (model_t - model_uncond)
|
||||||
|
|
||||||
|
if self.model.parameterization == "v":
|
||||||
|
e_t = self.model.predict_eps_from_z_and_v(x, t, model_output)
|
||||||
|
else:
|
||||||
|
e_t = model_output
|
||||||
|
|
||||||
|
if score_corrector is not None:
|
||||||
|
assert self.model.parameterization == "eps", 'not implemented'
|
||||||
|
e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)
|
||||||
|
|
||||||
|
alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
|
||||||
|
alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
|
||||||
|
sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
|
||||||
|
sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
|
||||||
|
# select parameters corresponding to the currently considered timestep
|
||||||
|
a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
|
||||||
|
a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
|
||||||
|
sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
|
||||||
|
sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)
|
||||||
|
|
||||||
|
# current prediction for x_0
|
||||||
|
if self.model.parameterization != "v":
|
||||||
|
pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
|
||||||
|
else:
|
||||||
|
pred_x0 = self.model.predict_start_from_z_and_v(x, t, model_output)
|
||||||
|
|
||||||
|
if quantize_denoised:
|
||||||
|
pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
|
||||||
|
|
||||||
|
if dynamic_threshold is not None:
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
# direction pointing to x_t
|
||||||
|
dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
|
||||||
|
noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
|
||||||
|
if noise_dropout > 0.:
|
||||||
|
noise = torch.nn.functional.dropout(noise, p=noise_dropout)
|
||||||
|
x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
|
||||||
|
return x_prev, pred_x0
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def encode(self, x0, c, t_enc, use_original_steps=False, return_intermediates=None,
|
||||||
|
unconditional_guidance_scale=1.0, unconditional_conditioning=None, callback=None):
|
||||||
|
timesteps = np.arange(self.ddpm_num_timesteps) if use_original_steps else self.ddim_timesteps
|
||||||
|
num_reference_steps = timesteps.shape[0]
|
||||||
|
|
||||||
|
assert t_enc <= num_reference_steps
|
||||||
|
num_steps = t_enc
|
||||||
|
|
||||||
|
if use_original_steps:
|
||||||
|
alphas_next = self.alphas_cumprod[:num_steps]
|
||||||
|
alphas = self.alphas_cumprod_prev[:num_steps]
|
||||||
|
else:
|
||||||
|
alphas_next = self.ddim_alphas[:num_steps]
|
||||||
|
alphas = torch.tensor(self.ddim_alphas_prev[:num_steps])
|
||||||
|
|
||||||
|
x_next = x0
|
||||||
|
intermediates = []
|
||||||
|
inter_steps = []
|
||||||
|
for i in tqdm(range(num_steps), desc='Encoding Image'):
|
||||||
|
t = torch.full((x0.shape[0],), timesteps[i], device=self.model.device, dtype=torch.long)
|
||||||
|
if unconditional_guidance_scale == 1.:
|
||||||
|
noise_pred = self.model.apply_model(x_next, t, c)
|
||||||
|
else:
|
||||||
|
assert unconditional_conditioning is not None
|
||||||
|
e_t_uncond, noise_pred = torch.chunk(
|
||||||
|
self.model.apply_model(torch.cat((x_next, x_next)), torch.cat((t, t)),
|
||||||
|
torch.cat((unconditional_conditioning, c))), 2)
|
||||||
|
noise_pred = e_t_uncond + unconditional_guidance_scale * (noise_pred - e_t_uncond)
|
||||||
|
|
||||||
|
xt_weighted = (alphas_next[i] / alphas[i]).sqrt() * x_next
|
||||||
|
weighted_noise_pred = alphas_next[i].sqrt() * (
|
||||||
|
(1 / alphas_next[i] - 1).sqrt() - (1 / alphas[i] - 1).sqrt()) * noise_pred
|
||||||
|
x_next = xt_weighted + weighted_noise_pred
|
||||||
|
if return_intermediates and i % (
|
||||||
|
num_steps // return_intermediates) == 0 and i < num_steps - 1:
|
||||||
|
intermediates.append(x_next)
|
||||||
|
inter_steps.append(i)
|
||||||
|
elif return_intermediates and i >= num_steps - 2:
|
||||||
|
intermediates.append(x_next)
|
||||||
|
inter_steps.append(i)
|
||||||
|
if callback: callback(i)
|
||||||
|
|
||||||
|
out = {'x_encoded': x_next, 'intermediate_steps': inter_steps}
|
||||||
|
if return_intermediates:
|
||||||
|
out.update({'intermediates': intermediates})
|
||||||
|
return x_next, out
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def stochastic_encode(self, x0, t, use_original_steps=False, noise=None):
|
||||||
|
# fast, but does not allow for exact reconstruction
|
||||||
|
# t serves as an index to gather the correct alphas
|
||||||
|
if use_original_steps:
|
||||||
|
sqrt_alphas_cumprod = self.sqrt_alphas_cumprod
|
||||||
|
sqrt_one_minus_alphas_cumprod = self.sqrt_one_minus_alphas_cumprod
|
||||||
|
else:
|
||||||
|
sqrt_alphas_cumprod = torch.sqrt(self.ddim_alphas)
|
||||||
|
sqrt_one_minus_alphas_cumprod = self.ddim_sqrt_one_minus_alphas
|
||||||
|
|
||||||
|
if noise is None:
|
||||||
|
noise = torch.randn_like(x0)
|
||||||
|
return (extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape) * x0 +
|
||||||
|
extract_into_tensor(sqrt_one_minus_alphas_cumprod, t, x0.shape) * noise)
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def decode(self, x_latent, cond, t_start, unconditional_guidance_scale=1.0, unconditional_conditioning=None,
|
||||||
|
use_original_steps=False, callback=None):
|
||||||
|
|
||||||
|
timesteps = np.arange(self.ddpm_num_timesteps) if use_original_steps else self.ddim_timesteps
|
||||||
|
timesteps = timesteps[:t_start]
|
||||||
|
|
||||||
|
time_range = np.flip(timesteps)
|
||||||
|
total_steps = timesteps.shape[0]
|
||||||
|
print(f"Running DDIM Sampling with {total_steps} timesteps")
|
||||||
|
|
||||||
|
iterator = tqdm(time_range, desc='Decoding image', total=total_steps)
|
||||||
|
x_dec = x_latent
|
||||||
|
for i, step in enumerate(iterator):
|
||||||
|
index = total_steps - i - 1
|
||||||
|
ts = torch.full((x_latent.shape[0],), step, device=x_latent.device, dtype=torch.long)
|
||||||
|
x_dec, _ = self.p_sample_ddim(x_dec, cond, ts, index=index, use_original_steps=use_original_steps,
|
||||||
|
unconditional_guidance_scale=unconditional_guidance_scale,
|
||||||
|
unconditional_conditioning=unconditional_conditioning)
|
||||||
|
if callback: callback(i)
|
||||||
|
return x_dec
|
168
AnyText/AnyText_scripts/cldm/embedding_manager.py
Normal file
@ -0,0 +1,168 @@
|
|||||||
|
'''
|
||||||
|
Copyright (c) Alibaba, Inc. and its affiliates.
|
||||||
|
'''
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from functools import partial
|
||||||
|
from ..ldm.modules.diffusionmodules.util import conv_nd, linear, zero_module
|
||||||
|
|
||||||
|
|
||||||
|
def get_clip_token_for_string(tokenizer, string):
|
||||||
|
batch_encoding = tokenizer(string, truncation=True, max_length=77, return_length=True,
|
||||||
|
return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
|
||||||
|
tokens = batch_encoding["input_ids"]
|
||||||
|
assert torch.count_nonzero(tokens - 49407) == 2, f"String '{string}' maps to more than a single token. Please use another string"
|
||||||
|
return tokens[0, 1]
|
||||||
|
|
||||||
|
|
||||||
|
def get_bert_token_for_string(tokenizer, string):
|
||||||
|
token = tokenizer(string)
|
||||||
|
assert torch.count_nonzero(token) == 3, f"String '{string}' maps to more than a single token. Please use another string"
|
||||||
|
token = token[0, 1]
|
||||||
|
return token
|
||||||
|
|
||||||
|
|
||||||
|
def get_clip_vision_emb(encoder, processor, img):
|
||||||
|
_img = img.repeat(1, 3, 1, 1)*255
|
||||||
|
inputs = processor(images=_img, return_tensors="pt")
|
||||||
|
inputs['pixel_values'] = inputs['pixel_values'].to(img.device)
|
||||||
|
outputs = encoder(**inputs)
|
||||||
|
emb = outputs.image_embeds
|
||||||
|
return emb
|
||||||
|
|
||||||
|
|
||||||
|
def get_recog_emb(encoder, img_list):
|
||||||
|
_img_list = [(img.repeat(1, 3, 1, 1)*255)[0] for img in img_list]
|
||||||
|
encoder.predictor.eval()
|
||||||
|
_, preds_neck = encoder.pred_imglist(_img_list, show_debug=False)
|
||||||
|
return preds_neck
|
||||||
|
|
||||||
|
|
||||||
|
def pad_H(x):
|
||||||
|
_, _, H, W = x.shape
|
||||||
|
p_top = (W - H) // 2
|
||||||
|
p_bot = W - H - p_top
|
||||||
|
return F.pad(x, (0, 0, p_top, p_bot))
|
||||||
|
|
||||||
|
|
||||||
|
class EncodeNet(nn.Module):
|
||||||
|
def __init__(self, in_channels, out_channels):
|
||||||
|
super(EncodeNet, self).__init__()
|
||||||
|
chan = 16
|
||||||
|
n_layer = 4 # downsample
|
||||||
|
|
||||||
|
self.conv1 = conv_nd(2, in_channels, chan, 3, padding=1)
|
||||||
|
self.conv_list = nn.ModuleList([])
|
||||||
|
_c = chan
|
||||||
|
for i in range(n_layer):
|
||||||
|
self.conv_list.append(conv_nd(2, _c, _c*2, 3, padding=1, stride=2))
|
||||||
|
_c *= 2
|
||||||
|
self.conv2 = conv_nd(2, _c, out_channels, 3, padding=1)
|
||||||
|
self.avgpool = nn.AdaptiveAvgPool2d(1)
|
||||||
|
self.act = nn.SiLU()
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
x = self.act(self.conv1(x))
|
||||||
|
for layer in self.conv_list:
|
||||||
|
x = self.act(layer(x))
|
||||||
|
x = self.act(self.conv2(x))
|
||||||
|
x = self.avgpool(x)
|
||||||
|
x = x.view(x.size(0), -1)
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class EmbeddingManager(nn.Module):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
embedder,
|
||||||
|
valid=True,
|
||||||
|
glyph_channels=20,
|
||||||
|
position_channels=1,
|
||||||
|
placeholder_string='*',
|
||||||
|
add_pos=False,
|
||||||
|
emb_type='ocr',
|
||||||
|
**kwargs
|
||||||
|
):
|
||||||
|
super().__init__()
|
||||||
|
if hasattr(embedder, 'tokenizer'): # using Stable Diffusion's CLIP encoder
|
||||||
|
get_token_for_string = partial(get_clip_token_for_string, embedder.tokenizer)
|
||||||
|
token_dim = 768
|
||||||
|
if hasattr(embedder, 'vit'):
|
||||||
|
assert emb_type == 'vit'
|
||||||
|
self.get_vision_emb = partial(get_clip_vision_emb, embedder.vit, embedder.processor)
|
||||||
|
self.get_recog_emb = None
|
||||||
|
else: # using LDM's BERT encoder
|
||||||
|
get_token_for_string = partial(get_bert_token_for_string, embedder.tknz_fn)
|
||||||
|
token_dim = 1280
|
||||||
|
self.token_dim = token_dim
|
||||||
|
self.emb_type = emb_type
|
||||||
|
|
||||||
|
self.add_pos = add_pos
|
||||||
|
if add_pos:
|
||||||
|
self.position_encoder = EncodeNet(position_channels, token_dim)
|
||||||
|
if emb_type == 'ocr':
|
||||||
|
self.proj = nn.Sequential(
|
||||||
|
zero_module(linear(40*64, token_dim)),
|
||||||
|
nn.LayerNorm(token_dim)
|
||||||
|
)
|
||||||
|
if emb_type == 'conv':
|
||||||
|
self.glyph_encoder = EncodeNet(glyph_channels, token_dim)
|
||||||
|
|
||||||
|
self.placeholder_token = get_token_for_string(placeholder_string)
|
||||||
|
|
||||||
|
def encode_text(self, text_info):
|
||||||
|
if self.get_recog_emb is None and self.emb_type == 'ocr':
|
||||||
|
self.get_recog_emb = partial(get_recog_emb, self.recog)
|
||||||
|
|
||||||
|
gline_list = []
|
||||||
|
pos_list = []
|
||||||
|
for i in range(len(text_info['n_lines'])): # sample index in a batch
|
||||||
|
n_lines = text_info['n_lines'][i]
|
||||||
|
for j in range(n_lines): # line
|
||||||
|
gline_list += [text_info['gly_line'][j][i:i+1]]
|
||||||
|
if self.add_pos:
|
||||||
|
pos_list += [text_info['positions'][j][i:i+1]]
|
||||||
|
|
||||||
|
if len(gline_list) > 0:
|
||||||
|
if self.emb_type == 'ocr':
|
||||||
|
recog_emb = self.get_recog_emb(gline_list)
|
||||||
|
enc_glyph = self.proj(recog_emb.reshape(recog_emb.shape[0], -1))
|
||||||
|
elif self.emb_type == 'vit':
|
||||||
|
enc_glyph = self.get_vision_emb(pad_H(torch.cat(gline_list, dim=0)))
|
||||||
|
elif self.emb_type == 'conv':
|
||||||
|
enc_glyph = self.glyph_encoder(pad_H(torch.cat(gline_list, dim=0)))
|
||||||
|
if self.add_pos:
|
||||||
|
enc_pos = self.position_encoder(torch.cat(gline_list, dim=0))
|
||||||
|
enc_glyph = enc_glyph+enc_pos
|
||||||
|
|
||||||
|
self.text_embs_all = []
|
||||||
|
n_idx = 0
|
||||||
|
for i in range(len(text_info['n_lines'])): # sample index in a batch
|
||||||
|
n_lines = text_info['n_lines'][i]
|
||||||
|
text_embs = []
|
||||||
|
for j in range(n_lines): # line
|
||||||
|
text_embs += [enc_glyph[n_idx:n_idx+1]]
|
||||||
|
n_idx += 1
|
||||||
|
self.text_embs_all += [text_embs]
|
||||||
|
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
tokenized_text,
|
||||||
|
embedded_text,
|
||||||
|
):
|
||||||
|
b, device = tokenized_text.shape[0], tokenized_text.device
|
||||||
|
for i in range(b):
|
||||||
|
idx = tokenized_text[i] == self.placeholder_token.to(device)
|
||||||
|
if sum(idx) > 0:
|
||||||
|
if i >= len(self.text_embs_all):
|
||||||
|
print('truncation for log images...')
|
||||||
|
break
|
||||||
|
text_emb = torch.cat(self.text_embs_all[i], dim=0)
|
||||||
|
if sum(idx) != len(text_emb):
|
||||||
|
print('truncation for long caption...')
|
||||||
|
embedded_text[i][idx] = text_emb[:sum(idx)]
|
||||||
|
return embedded_text
|
||||||
|
|
||||||
|
def embedding_parameters(self):
|
||||||
|
return self.parameters()
|
111
AnyText/AnyText_scripts/cldm/hack.py
Normal file
@ -0,0 +1,111 @@
|
|||||||
|
import torch
|
||||||
|
import einops
|
||||||
|
|
||||||
|
from ..ldm.modules.encoders import modules
|
||||||
|
from ..ldm.modules import attention
|
||||||
|
|
||||||
|
from transformers import logging
|
||||||
|
from ..ldm.modules.attention import default
|
||||||
|
|
||||||
|
|
||||||
|
def disable_verbosity():
|
||||||
|
logging.set_verbosity_error()
|
||||||
|
print('logging improved.')
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
def enable_sliced_attention():
|
||||||
|
attention.CrossAttention.forward = _hacked_sliced_attentin_forward
|
||||||
|
print('Enabled sliced_attention.')
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
def hack_everything(clip_skip=0):
|
||||||
|
disable_verbosity()
|
||||||
|
modules.FrozenCLIPEmbedder.forward = _hacked_clip_forward
|
||||||
|
modules.FrozenCLIPEmbedder.clip_skip = clip_skip
|
||||||
|
print('Enabled clip hacks.')
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
# Written by Lvmin
|
||||||
|
def _hacked_clip_forward(self, text):
|
||||||
|
PAD = self.tokenizer.pad_token_id
|
||||||
|
EOS = self.tokenizer.eos_token_id
|
||||||
|
BOS = self.tokenizer.bos_token_id
|
||||||
|
|
||||||
|
def tokenize(t):
|
||||||
|
return self.tokenizer(t, truncation=False, add_special_tokens=False)["input_ids"]
|
||||||
|
|
||||||
|
def transformer_encode(t):
|
||||||
|
if self.clip_skip > 1:
|
||||||
|
rt = self.transformer(input_ids=t, output_hidden_states=True)
|
||||||
|
return self.transformer.text_model.final_layer_norm(rt.hidden_states[-self.clip_skip])
|
||||||
|
else:
|
||||||
|
return self.transformer(input_ids=t, output_hidden_states=False).last_hidden_state
|
||||||
|
|
||||||
|
def split(x):
|
||||||
|
return x[75 * 0: 75 * 1], x[75 * 1: 75 * 2], x[75 * 2: 75 * 3]
|
||||||
|
|
||||||
|
def pad(x, p, i):
|
||||||
|
return x[:i] if len(x) >= i else x + [p] * (i - len(x))
|
||||||
|
|
||||||
|
raw_tokens_list = tokenize(text)
|
||||||
|
tokens_list = []
|
||||||
|
|
||||||
|
for raw_tokens in raw_tokens_list:
|
||||||
|
raw_tokens_123 = split(raw_tokens)
|
||||||
|
raw_tokens_123 = [[BOS] + raw_tokens_i + [EOS] for raw_tokens_i in raw_tokens_123]
|
||||||
|
raw_tokens_123 = [pad(raw_tokens_i, PAD, 77) for raw_tokens_i in raw_tokens_123]
|
||||||
|
tokens_list.append(raw_tokens_123)
|
||||||
|
|
||||||
|
tokens_list = torch.IntTensor(tokens_list).to(self.device)
|
||||||
|
|
||||||
|
feed = einops.rearrange(tokens_list, 'b f i -> (b f) i')
|
||||||
|
y = transformer_encode(feed)
|
||||||
|
z = einops.rearrange(y, '(b f) i c -> b (f i) c', f=3)
|
||||||
|
|
||||||
|
return z
|
||||||
|
|
||||||
|
|
||||||
|
# Stolen from https://github.com/basujindal/stable-diffusion/blob/main/optimizedSD/splitAttention.py
|
||||||
|
def _hacked_sliced_attentin_forward(self, x, context=None, mask=None):
|
||||||
|
h = self.heads
|
||||||
|
|
||||||
|
q = self.to_q(x)
|
||||||
|
context = default(context, x)
|
||||||
|
k = self.to_k(context)
|
||||||
|
v = self.to_v(context)
|
||||||
|
del context, x
|
||||||
|
|
||||||
|
q, k, v = map(lambda t: einops.rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
|
||||||
|
|
||||||
|
limit = k.shape[0]
|
||||||
|
att_step = 1
|
||||||
|
q_chunks = list(torch.tensor_split(q, limit // att_step, dim=0))
|
||||||
|
k_chunks = list(torch.tensor_split(k, limit // att_step, dim=0))
|
||||||
|
v_chunks = list(torch.tensor_split(v, limit // att_step, dim=0))
|
||||||
|
|
||||||
|
q_chunks.reverse()
|
||||||
|
k_chunks.reverse()
|
||||||
|
v_chunks.reverse()
|
||||||
|
sim = torch.zeros(q.shape[0], q.shape[1], v.shape[2], device=q.device)
|
||||||
|
del k, q, v
|
||||||
|
for i in range(0, limit, att_step):
|
||||||
|
q_buffer = q_chunks.pop()
|
||||||
|
k_buffer = k_chunks.pop()
|
||||||
|
v_buffer = v_chunks.pop()
|
||||||
|
sim_buffer = torch.einsum('b i d, b j d -> b i j', q_buffer, k_buffer) * self.scale
|
||||||
|
|
||||||
|
del k_buffer, q_buffer
|
||||||
|
# attention, what we cannot get enough of, by chunks
|
||||||
|
|
||||||
|
sim_buffer = sim_buffer.softmax(dim=-1)
|
||||||
|
|
||||||
|
sim_buffer = torch.einsum('b i j, b j d -> b i d', sim_buffer, v_buffer)
|
||||||
|
del v_buffer
|
||||||
|
sim[i:i + att_step, :, :] = sim_buffer
|
||||||
|
|
||||||
|
del sim_buffer
|
||||||
|
sim = einops.rearrange(sim, '(b h) n d -> b n (h d)', h=h)
|
||||||
|
return self.to_out(sim)
|
76
AnyText/AnyText_scripts/cldm/logger.py
Normal file
@ -0,0 +1,76 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
import torchvision
|
||||||
|
from PIL import Image
|
||||||
|
from pytorch_lightning.callbacks import Callback
|
||||||
|
from pytorch_lightning.utilities.rank_zero import rank_zero_only
|
||||||
|
|
||||||
|
|
||||||
|
class ImageLogger(Callback):
|
||||||
|
def __init__(self, batch_frequency=2000, max_images=4, clamp=True, increase_log_steps=True,
|
||||||
|
rescale=True, disabled=False, log_on_batch_idx=False, log_first_step=False,
|
||||||
|
log_images_kwargs=None):
|
||||||
|
super().__init__()
|
||||||
|
self.rescale = rescale
|
||||||
|
self.batch_freq = batch_frequency
|
||||||
|
self.max_images = max_images
|
||||||
|
if not increase_log_steps:
|
||||||
|
self.log_steps = [self.batch_freq]
|
||||||
|
self.clamp = clamp
|
||||||
|
self.disabled = disabled
|
||||||
|
self.log_on_batch_idx = log_on_batch_idx
|
||||||
|
self.log_images_kwargs = log_images_kwargs if log_images_kwargs else {}
|
||||||
|
self.log_first_step = log_first_step
|
||||||
|
|
||||||
|
@rank_zero_only
|
||||||
|
def log_local(self, save_dir, split, images, global_step, current_epoch, batch_idx):
|
||||||
|
root = os.path.join(save_dir, "image_log", split)
|
||||||
|
for k in images:
|
||||||
|
grid = torchvision.utils.make_grid(images[k], nrow=4)
|
||||||
|
if self.rescale:
|
||||||
|
grid = (grid + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w
|
||||||
|
grid = grid.transpose(0, 1).transpose(1, 2).squeeze(-1)
|
||||||
|
grid = grid.numpy()
|
||||||
|
grid = (grid * 255).astype(np.uint8)
|
||||||
|
filename = "{}_gs-{:06}_e-{:06}_b-{:06}.png".format(k, global_step, current_epoch, batch_idx)
|
||||||
|
path = os.path.join(root, filename)
|
||||||
|
os.makedirs(os.path.split(path)[0], exist_ok=True)
|
||||||
|
Image.fromarray(grid).save(path)
|
||||||
|
|
||||||
|
def log_img(self, pl_module, batch, batch_idx, split="train"):
|
||||||
|
check_idx = batch_idx # if self.log_on_batch_idx else pl_module.global_step
|
||||||
|
if (self.check_frequency(check_idx) and # batch_idx % self.batch_freq == 0
|
||||||
|
hasattr(pl_module, "log_images") and
|
||||||
|
callable(pl_module.log_images) and
|
||||||
|
self.max_images > 0):
|
||||||
|
logger = type(pl_module.logger)
|
||||||
|
|
||||||
|
is_train = pl_module.training
|
||||||
|
if is_train:
|
||||||
|
pl_module.eval()
|
||||||
|
|
||||||
|
with torch.no_grad():
|
||||||
|
images = pl_module.log_images(batch, split=split, **self.log_images_kwargs)
|
||||||
|
|
||||||
|
for k in images:
|
||||||
|
N = min(images[k].shape[0], self.max_images)
|
||||||
|
images[k] = images[k][:N]
|
||||||
|
if isinstance(images[k], torch.Tensor):
|
||||||
|
images[k] = images[k].detach().cpu()
|
||||||
|
if self.clamp:
|
||||||
|
images[k] = torch.clamp(images[k], -1., 1.)
|
||||||
|
|
||||||
|
self.log_local(pl_module.logger.save_dir, split, images,
|
||||||
|
pl_module.global_step, pl_module.current_epoch, batch_idx)
|
||||||
|
|
||||||
|
if is_train:
|
||||||
|
pl_module.train()
|
||||||
|
|
||||||
|
def check_frequency(self, check_idx):
|
||||||
|
return check_idx % self.batch_freq == 0
|
||||||
|
|
||||||
|
def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
|
||||||
|
if not self.disabled:
|
||||||
|
self.log_img(pl_module, batch, batch_idx, split="train")
|
34
AnyText/AnyText_scripts/cldm/model.py
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
import os
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from omegaconf import OmegaConf
|
||||||
|
from ..ldm.util import instantiate_from_config
|
||||||
|
|
||||||
|
|
||||||
|
def get_state_dict(d):
|
||||||
|
return d.get('state_dict', d)
|
||||||
|
|
||||||
|
|
||||||
|
def load_state_dict(ckpt_path, location='cpu'):
|
||||||
|
_, extension = os.path.splitext(ckpt_path)
|
||||||
|
if extension.lower() == ".safetensors":
|
||||||
|
import safetensors.torch
|
||||||
|
state_dict = safetensors.torch.load_file(ckpt_path, device=location)
|
||||||
|
else:
|
||||||
|
state_dict = get_state_dict(torch.load(ckpt_path, map_location=torch.device(location)))
|
||||||
|
state_dict = get_state_dict(state_dict)
|
||||||
|
print(f'Loaded state_dict from [{ckpt_path}]')
|
||||||
|
return state_dict
|
||||||
|
|
||||||
|
|
||||||
|
def create_model(config_path, cond_stage_path=None, use_fp16=False):
|
||||||
|
config = OmegaConf.load(config_path)
|
||||||
|
if cond_stage_path:
|
||||||
|
config.model.params.cond_stage_config.params.version = cond_stage_path # use pre-downloaded ckpts, in case blocked
|
||||||
|
if use_fp16:
|
||||||
|
config.model.params.use_fp16 = True
|
||||||
|
config.model.params.control_stage_config.params.use_fp16 = True
|
||||||
|
config.model.params.unet_config.params.use_fp16 = True
|
||||||
|
model = instantiate_from_config(config.model).cpu()
|
||||||
|
print(f'Loaded model config from [{config_path}]')
|
||||||
|
return model
|
210
AnyText/AnyText_scripts/cldm/ocr_recog/RNN.py
Normal file
@ -0,0 +1,210 @@
|
|||||||
|
from torch import nn
|
||||||
|
import torch
|
||||||
|
from .RecSVTR import Block
|
||||||
|
|
||||||
|
class Swish(nn.Module):
|
||||||
|
def __int__(self):
|
||||||
|
super(Swish, self).__int__()
|
||||||
|
|
||||||
|
def forward(self,x):
|
||||||
|
return x*torch.sigmoid(x)
|
||||||
|
|
||||||
|
class Im2Im(nn.Module):
|
||||||
|
def __init__(self, in_channels, **kwargs):
|
||||||
|
super().__init__()
|
||||||
|
self.out_channels = in_channels
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
return x
|
||||||
|
|
||||||
|
class Im2Seq(nn.Module):
|
||||||
|
def __init__(self, in_channels, **kwargs):
|
||||||
|
super().__init__()
|
||||||
|
self.out_channels = in_channels
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
B, C, H, W = x.shape
|
||||||
|
# assert H == 1
|
||||||
|
x = x.reshape(B, C, H * W)
|
||||||
|
x = x.permute((0, 2, 1))
|
||||||
|
return x
|
||||||
|
|
||||||
|
class EncoderWithRNN(nn.Module):
|
||||||
|
def __init__(self, in_channels,**kwargs):
|
||||||
|
super(EncoderWithRNN, self).__init__()
|
||||||
|
hidden_size = kwargs.get('hidden_size', 256)
|
||||||
|
self.out_channels = hidden_size * 2
|
||||||
|
self.lstm = nn.LSTM(in_channels, hidden_size, bidirectional=True, num_layers=2,batch_first=True)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
self.lstm.flatten_parameters()
|
||||||
|
x, _ = self.lstm(x)
|
||||||
|
return x
|
||||||
|
|
||||||
|
class SequenceEncoder(nn.Module):
|
||||||
|
def __init__(self, in_channels, encoder_type='rnn', **kwargs):
|
||||||
|
super(SequenceEncoder, self).__init__()
|
||||||
|
self.encoder_reshape = Im2Seq(in_channels)
|
||||||
|
self.out_channels = self.encoder_reshape.out_channels
|
||||||
|
self.encoder_type = encoder_type
|
||||||
|
if encoder_type == 'reshape':
|
||||||
|
self.only_reshape = True
|
||||||
|
else:
|
||||||
|
support_encoder_dict = {
|
||||||
|
'reshape': Im2Seq,
|
||||||
|
'rnn': EncoderWithRNN,
|
||||||
|
'svtr': EncoderWithSVTR
|
||||||
|
}
|
||||||
|
assert encoder_type in support_encoder_dict, '{} must in {}'.format(
|
||||||
|
encoder_type, support_encoder_dict.keys())
|
||||||
|
|
||||||
|
self.encoder = support_encoder_dict[encoder_type](
|
||||||
|
self.encoder_reshape.out_channels,**kwargs)
|
||||||
|
self.out_channels = self.encoder.out_channels
|
||||||
|
self.only_reshape = False
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
if self.encoder_type != 'svtr':
|
||||||
|
x = self.encoder_reshape(x)
|
||||||
|
if not self.only_reshape:
|
||||||
|
x = self.encoder(x)
|
||||||
|
return x
|
||||||
|
else:
|
||||||
|
x = self.encoder(x)
|
||||||
|
x = self.encoder_reshape(x)
|
||||||
|
return x
|
||||||
|
|
||||||
|
class ConvBNLayer(nn.Module):
|
||||||
|
def __init__(self,
|
||||||
|
in_channels,
|
||||||
|
out_channels,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=1,
|
||||||
|
padding=0,
|
||||||
|
bias_attr=False,
|
||||||
|
groups=1,
|
||||||
|
act=nn.GELU):
|
||||||
|
super().__init__()
|
||||||
|
self.conv = nn.Conv2d(
|
||||||
|
in_channels=in_channels,
|
||||||
|
out_channels=out_channels,
|
||||||
|
kernel_size=kernel_size,
|
||||||
|
stride=stride,
|
||||||
|
padding=padding,
|
||||||
|
groups=groups,
|
||||||
|
# weight_attr=paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()),
|
||||||
|
bias=bias_attr)
|
||||||
|
self.norm = nn.BatchNorm2d(out_channels)
|
||||||
|
self.act = Swish()
|
||||||
|
|
||||||
|
def forward(self, inputs):
|
||||||
|
out = self.conv(inputs)
|
||||||
|
out = self.norm(out)
|
||||||
|
out = self.act(out)
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
class EncoderWithSVTR(nn.Module):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
in_channels,
|
||||||
|
dims=64, # XS
|
||||||
|
depth=2,
|
||||||
|
hidden_dims=120,
|
||||||
|
use_guide=False,
|
||||||
|
num_heads=8,
|
||||||
|
qkv_bias=True,
|
||||||
|
mlp_ratio=2.0,
|
||||||
|
drop_rate=0.1,
|
||||||
|
attn_drop_rate=0.1,
|
||||||
|
drop_path=0.,
|
||||||
|
qk_scale=None):
|
||||||
|
super(EncoderWithSVTR, self).__init__()
|
||||||
|
self.depth = depth
|
||||||
|
self.use_guide = use_guide
|
||||||
|
self.conv1 = ConvBNLayer(
|
||||||
|
in_channels, in_channels // 8, padding=1, act='swish')
|
||||||
|
self.conv2 = ConvBNLayer(
|
||||||
|
in_channels // 8, hidden_dims, kernel_size=1, act='swish')
|
||||||
|
|
||||||
|
self.svtr_block = nn.ModuleList([
|
||||||
|
Block(
|
||||||
|
dim=hidden_dims,
|
||||||
|
num_heads=num_heads,
|
||||||
|
mixer='Global',
|
||||||
|
HW=None,
|
||||||
|
mlp_ratio=mlp_ratio,
|
||||||
|
qkv_bias=qkv_bias,
|
||||||
|
qk_scale=qk_scale,
|
||||||
|
drop=drop_rate,
|
||||||
|
act_layer='swish',
|
||||||
|
attn_drop=attn_drop_rate,
|
||||||
|
drop_path=drop_path,
|
||||||
|
norm_layer='nn.LayerNorm',
|
||||||
|
epsilon=1e-05,
|
||||||
|
prenorm=False) for i in range(depth)
|
||||||
|
])
|
||||||
|
self.norm = nn.LayerNorm(hidden_dims, eps=1e-6)
|
||||||
|
self.conv3 = ConvBNLayer(
|
||||||
|
hidden_dims, in_channels, kernel_size=1, act='swish')
|
||||||
|
# last conv-nxn, the input is concat of input tensor and conv3 output tensor
|
||||||
|
self.conv4 = ConvBNLayer(
|
||||||
|
2 * in_channels, in_channels // 8, padding=1, act='swish')
|
||||||
|
|
||||||
|
self.conv1x1 = ConvBNLayer(
|
||||||
|
in_channels // 8, dims, kernel_size=1, act='swish')
|
||||||
|
self.out_channels = dims
|
||||||
|
self.apply(self._init_weights)
|
||||||
|
|
||||||
|
def _init_weights(self, m):
|
||||||
|
# weight initialization
|
||||||
|
if isinstance(m, nn.Conv2d):
|
||||||
|
nn.init.kaiming_normal_(m.weight, mode='fan_out')
|
||||||
|
if m.bias is not None:
|
||||||
|
nn.init.zeros_(m.bias)
|
||||||
|
elif isinstance(m, nn.BatchNorm2d):
|
||||||
|
nn.init.ones_(m.weight)
|
||||||
|
nn.init.zeros_(m.bias)
|
||||||
|
elif isinstance(m, nn.Linear):
|
||||||
|
nn.init.normal_(m.weight, 0, 0.01)
|
||||||
|
if m.bias is not None:
|
||||||
|
nn.init.zeros_(m.bias)
|
||||||
|
elif isinstance(m, nn.ConvTranspose2d):
|
||||||
|
nn.init.kaiming_normal_(m.weight, mode='fan_out')
|
||||||
|
if m.bias is not None:
|
||||||
|
nn.init.zeros_(m.bias)
|
||||||
|
elif isinstance(m, nn.LayerNorm):
|
||||||
|
nn.init.ones_(m.weight)
|
||||||
|
nn.init.zeros_(m.bias)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
# for use guide
|
||||||
|
if self.use_guide:
|
||||||
|
z = x.clone()
|
||||||
|
z.stop_gradient = True
|
||||||
|
else:
|
||||||
|
z = x
|
||||||
|
# for short cut
|
||||||
|
h = z
|
||||||
|
# reduce dim
|
||||||
|
z = self.conv1(z)
|
||||||
|
z = self.conv2(z)
|
||||||
|
# SVTR global block
|
||||||
|
B, C, H, W = z.shape
|
||||||
|
z = z.flatten(2).permute(0, 2, 1)
|
||||||
|
|
||||||
|
for blk in self.svtr_block:
|
||||||
|
z = blk(z)
|
||||||
|
|
||||||
|
z = self.norm(z)
|
||||||
|
# last stage
|
||||||
|
z = z.reshape([-1, H, W, C]).permute(0, 3, 1, 2)
|
||||||
|
z = self.conv3(z)
|
||||||
|
z = torch.cat((h, z), dim=1)
|
||||||
|
z = self.conv1x1(self.conv4(z))
|
||||||
|
|
||||||
|
return z
|
||||||
|
|
||||||
|
if __name__=="__main__":
|
||||||
|
svtrRNN = EncoderWithSVTR(56)
|
||||||
|
print(svtrRNN)
|
48
AnyText/AnyText_scripts/cldm/ocr_recog/RecCTCHead.py
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
from torch import nn
|
||||||
|
|
||||||
|
|
||||||
|
class CTCHead(nn.Module):
|
||||||
|
def __init__(self,
|
||||||
|
in_channels,
|
||||||
|
out_channels=6625,
|
||||||
|
fc_decay=0.0004,
|
||||||
|
mid_channels=None,
|
||||||
|
return_feats=False,
|
||||||
|
**kwargs):
|
||||||
|
super(CTCHead, self).__init__()
|
||||||
|
if mid_channels is None:
|
||||||
|
self.fc = nn.Linear(
|
||||||
|
in_channels,
|
||||||
|
out_channels,
|
||||||
|
bias=True,)
|
||||||
|
else:
|
||||||
|
self.fc1 = nn.Linear(
|
||||||
|
in_channels,
|
||||||
|
mid_channels,
|
||||||
|
bias=True,
|
||||||
|
)
|
||||||
|
self.fc2 = nn.Linear(
|
||||||
|
mid_channels,
|
||||||
|
out_channels,
|
||||||
|
bias=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.out_channels = out_channels
|
||||||
|
self.mid_channels = mid_channels
|
||||||
|
self.return_feats = return_feats
|
||||||
|
|
||||||
|
def forward(self, x, labels=None):
|
||||||
|
if self.mid_channels is None:
|
||||||
|
predicts = self.fc(x)
|
||||||
|
else:
|
||||||
|
x = self.fc1(x)
|
||||||
|
predicts = self.fc2(x)
|
||||||
|
|
||||||
|
if self.return_feats:
|
||||||
|
result = dict()
|
||||||
|
result['ctc'] = predicts
|
||||||
|
result['ctc_neck'] = x
|
||||||
|
else:
|
||||||
|
result = predicts
|
||||||
|
|
||||||
|
return result
|
45
AnyText/AnyText_scripts/cldm/ocr_recog/RecModel.py
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
from torch import nn
|
||||||
|
from .RNN import SequenceEncoder, Im2Seq, Im2Im
|
||||||
|
from .RecMv1_enhance import MobileNetV1Enhance
|
||||||
|
|
||||||
|
from .RecCTCHead import CTCHead
|
||||||
|
|
||||||
|
backbone_dict = {"MobileNetV1Enhance":MobileNetV1Enhance}
|
||||||
|
neck_dict = {'SequenceEncoder': SequenceEncoder, 'Im2Seq': Im2Seq,'None':Im2Im}
|
||||||
|
head_dict = {'CTCHead':CTCHead}
|
||||||
|
|
||||||
|
|
||||||
|
class RecModel(nn.Module):
|
||||||
|
def __init__(self, config):
|
||||||
|
super().__init__()
|
||||||
|
assert 'in_channels' in config, 'in_channels must in model config'
|
||||||
|
backbone_type = config.backbone.pop('type')
|
||||||
|
assert backbone_type in backbone_dict, f'backbone.type must in {backbone_dict}'
|
||||||
|
self.backbone = backbone_dict[backbone_type](config.in_channels, **config.backbone)
|
||||||
|
|
||||||
|
neck_type = config.neck.pop('type')
|
||||||
|
assert neck_type in neck_dict, f'neck.type must in {neck_dict}'
|
||||||
|
self.neck = neck_dict[neck_type](self.backbone.out_channels, **config.neck)
|
||||||
|
|
||||||
|
head_type = config.head.pop('type')
|
||||||
|
assert head_type in head_dict, f'head.type must in {head_dict}'
|
||||||
|
self.head = head_dict[head_type](self.neck.out_channels, **config.head)
|
||||||
|
|
||||||
|
self.name = f'RecModel_{backbone_type}_{neck_type}_{head_type}'
|
||||||
|
|
||||||
|
def load_3rd_state_dict(self, _3rd_name, _state):
|
||||||
|
self.backbone.load_3rd_state_dict(_3rd_name, _state)
|
||||||
|
self.neck.load_3rd_state_dict(_3rd_name, _state)
|
||||||
|
self.head.load_3rd_state_dict(_3rd_name, _state)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
x = self.backbone(x)
|
||||||
|
x = self.neck(x)
|
||||||
|
x = self.head(x)
|
||||||
|
return x
|
||||||
|
|
||||||
|
def encode(self, x):
|
||||||
|
x = self.backbone(x)
|
||||||
|
x = self.neck(x)
|
||||||
|
x = self.head.ctc_encoder(x)
|
||||||
|
return x
|
233
AnyText/AnyText_scripts/cldm/ocr_recog/RecMv1_enhance.py
Normal file
@ -0,0 +1,233 @@
|
|||||||
|
import os, sys
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from .common import Activation
|
||||||
|
|
||||||
|
|
||||||
|
class ConvBNLayer(nn.Module):
|
||||||
|
def __init__(self,
|
||||||
|
num_channels,
|
||||||
|
filter_size,
|
||||||
|
num_filters,
|
||||||
|
stride,
|
||||||
|
padding,
|
||||||
|
channels=None,
|
||||||
|
num_groups=1,
|
||||||
|
act='hard_swish'):
|
||||||
|
super(ConvBNLayer, self).__init__()
|
||||||
|
self.act = act
|
||||||
|
self._conv = nn.Conv2d(
|
||||||
|
in_channels=num_channels,
|
||||||
|
out_channels=num_filters,
|
||||||
|
kernel_size=filter_size,
|
||||||
|
stride=stride,
|
||||||
|
padding=padding,
|
||||||
|
groups=num_groups,
|
||||||
|
bias=False)
|
||||||
|
|
||||||
|
self._batch_norm = nn.BatchNorm2d(
|
||||||
|
num_filters,
|
||||||
|
)
|
||||||
|
if self.act is not None:
|
||||||
|
self._act = Activation(act_type=act, inplace=True)
|
||||||
|
|
||||||
|
def forward(self, inputs):
|
||||||
|
y = self._conv(inputs)
|
||||||
|
y = self._batch_norm(y)
|
||||||
|
if self.act is not None:
|
||||||
|
y = self._act(y)
|
||||||
|
return y
|
||||||
|
|
||||||
|
|
||||||
|
class DepthwiseSeparable(nn.Module):
|
||||||
|
def __init__(self,
|
||||||
|
num_channels,
|
||||||
|
num_filters1,
|
||||||
|
num_filters2,
|
||||||
|
num_groups,
|
||||||
|
stride,
|
||||||
|
scale,
|
||||||
|
dw_size=3,
|
||||||
|
padding=1,
|
||||||
|
use_se=False):
|
||||||
|
super(DepthwiseSeparable, self).__init__()
|
||||||
|
self.use_se = use_se
|
||||||
|
self._depthwise_conv = ConvBNLayer(
|
||||||
|
num_channels=num_channels,
|
||||||
|
num_filters=int(num_filters1 * scale),
|
||||||
|
filter_size=dw_size,
|
||||||
|
stride=stride,
|
||||||
|
padding=padding,
|
||||||
|
num_groups=int(num_groups * scale))
|
||||||
|
if use_se:
|
||||||
|
self._se = SEModule(int(num_filters1 * scale))
|
||||||
|
self._pointwise_conv = ConvBNLayer(
|
||||||
|
num_channels=int(num_filters1 * scale),
|
||||||
|
filter_size=1,
|
||||||
|
num_filters=int(num_filters2 * scale),
|
||||||
|
stride=1,
|
||||||
|
padding=0)
|
||||||
|
|
||||||
|
def forward(self, inputs):
|
||||||
|
y = self._depthwise_conv(inputs)
|
||||||
|
if self.use_se:
|
||||||
|
y = self._se(y)
|
||||||
|
y = self._pointwise_conv(y)
|
||||||
|
return y
|
||||||
|
|
||||||
|
|
||||||
|
class MobileNetV1Enhance(nn.Module):
|
||||||
|
def __init__(self,
|
||||||
|
in_channels=3,
|
||||||
|
scale=0.5,
|
||||||
|
last_conv_stride=1,
|
||||||
|
last_pool_type='max',
|
||||||
|
**kwargs):
|
||||||
|
super().__init__()
|
||||||
|
self.scale = scale
|
||||||
|
self.block_list = []
|
||||||
|
|
||||||
|
self.conv1 = ConvBNLayer(
|
||||||
|
num_channels=in_channels,
|
||||||
|
filter_size=3,
|
||||||
|
channels=3,
|
||||||
|
num_filters=int(32 * scale),
|
||||||
|
stride=2,
|
||||||
|
padding=1)
|
||||||
|
|
||||||
|
conv2_1 = DepthwiseSeparable(
|
||||||
|
num_channels=int(32 * scale),
|
||||||
|
num_filters1=32,
|
||||||
|
num_filters2=64,
|
||||||
|
num_groups=32,
|
||||||
|
stride=1,
|
||||||
|
scale=scale)
|
||||||
|
self.block_list.append(conv2_1)
|
||||||
|
|
||||||
|
conv2_2 = DepthwiseSeparable(
|
||||||
|
num_channels=int(64 * scale),
|
||||||
|
num_filters1=64,
|
||||||
|
num_filters2=128,
|
||||||
|
num_groups=64,
|
||||||
|
stride=1,
|
||||||
|
scale=scale)
|
||||||
|
self.block_list.append(conv2_2)
|
||||||
|
|
||||||
|
conv3_1 = DepthwiseSeparable(
|
||||||
|
num_channels=int(128 * scale),
|
||||||
|
num_filters1=128,
|
||||||
|
num_filters2=128,
|
||||||
|
num_groups=128,
|
||||||
|
stride=1,
|
||||||
|
scale=scale)
|
||||||
|
self.block_list.append(conv3_1)
|
||||||
|
|
||||||
|
conv3_2 = DepthwiseSeparable(
|
||||||
|
num_channels=int(128 * scale),
|
||||||
|
num_filters1=128,
|
||||||
|
num_filters2=256,
|
||||||
|
num_groups=128,
|
||||||
|
stride=(2, 1),
|
||||||
|
scale=scale)
|
||||||
|
self.block_list.append(conv3_2)
|
||||||
|
|
||||||
|
conv4_1 = DepthwiseSeparable(
|
||||||
|
num_channels=int(256 * scale),
|
||||||
|
num_filters1=256,
|
||||||
|
num_filters2=256,
|
||||||
|
num_groups=256,
|
||||||
|
stride=1,
|
||||||
|
scale=scale)
|
||||||
|
self.block_list.append(conv4_1)
|
||||||
|
|
||||||
|
conv4_2 = DepthwiseSeparable(
|
||||||
|
num_channels=int(256 * scale),
|
||||||
|
num_filters1=256,
|
||||||
|
num_filters2=512,
|
||||||
|
num_groups=256,
|
||||||
|
stride=(2, 1),
|
||||||
|
scale=scale)
|
||||||
|
self.block_list.append(conv4_2)
|
||||||
|
|
||||||
|
for _ in range(5):
|
||||||
|
conv5 = DepthwiseSeparable(
|
||||||
|
num_channels=int(512 * scale),
|
||||||
|
num_filters1=512,
|
||||||
|
num_filters2=512,
|
||||||
|
num_groups=512,
|
||||||
|
stride=1,
|
||||||
|
dw_size=5,
|
||||||
|
padding=2,
|
||||||
|
scale=scale,
|
||||||
|
use_se=False)
|
||||||
|
self.block_list.append(conv5)
|
||||||
|
|
||||||
|
conv5_6 = DepthwiseSeparable(
|
||||||
|
num_channels=int(512 * scale),
|
||||||
|
num_filters1=512,
|
||||||
|
num_filters2=1024,
|
||||||
|
num_groups=512,
|
||||||
|
stride=(2, 1),
|
||||||
|
dw_size=5,
|
||||||
|
padding=2,
|
||||||
|
scale=scale,
|
||||||
|
use_se=True)
|
||||||
|
self.block_list.append(conv5_6)
|
||||||
|
|
||||||
|
conv6 = DepthwiseSeparable(
|
||||||
|
num_channels=int(1024 * scale),
|
||||||
|
num_filters1=1024,
|
||||||
|
num_filters2=1024,
|
||||||
|
num_groups=1024,
|
||||||
|
stride=last_conv_stride,
|
||||||
|
dw_size=5,
|
||||||
|
padding=2,
|
||||||
|
use_se=True,
|
||||||
|
scale=scale)
|
||||||
|
self.block_list.append(conv6)
|
||||||
|
|
||||||
|
self.block_list = nn.Sequential(*self.block_list)
|
||||||
|
if last_pool_type == 'avg':
|
||||||
|
self.pool = nn.AvgPool2d(kernel_size=2, stride=2, padding=0)
|
||||||
|
else:
|
||||||
|
self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
|
||||||
|
self.out_channels = int(1024 * scale)
|
||||||
|
|
||||||
|
def forward(self, inputs):
|
||||||
|
y = self.conv1(inputs)
|
||||||
|
y = self.block_list(y)
|
||||||
|
y = self.pool(y)
|
||||||
|
return y
|
||||||
|
|
||||||
|
def hardsigmoid(x):
|
||||||
|
return F.relu6(x + 3., inplace=True) / 6.
|
||||||
|
|
||||||
|
class SEModule(nn.Module):
|
||||||
|
def __init__(self, channel, reduction=4):
|
||||||
|
super(SEModule, self).__init__()
|
||||||
|
self.avg_pool = nn.AdaptiveAvgPool2d(1)
|
||||||
|
self.conv1 = nn.Conv2d(
|
||||||
|
in_channels=channel,
|
||||||
|
out_channels=channel // reduction,
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0,
|
||||||
|
bias=True)
|
||||||
|
self.conv2 = nn.Conv2d(
|
||||||
|
in_channels=channel // reduction,
|
||||||
|
out_channels=channel,
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0,
|
||||||
|
bias=True)
|
||||||
|
|
||||||
|
def forward(self, inputs):
|
||||||
|
outputs = self.avg_pool(inputs)
|
||||||
|
outputs = self.conv1(outputs)
|
||||||
|
outputs = F.relu(outputs)
|
||||||
|
outputs = self.conv2(outputs)
|
||||||
|
outputs = hardsigmoid(outputs)
|
||||||
|
x = torch.mul(inputs, outputs)
|
||||||
|
|
||||||
|
return x
|
591
AnyText/AnyText_scripts/cldm/ocr_recog/RecSVTR.py
Normal file
@ -0,0 +1,591 @@
|
|||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import numpy as np
|
||||||
|
from torch.nn.init import trunc_normal_, zeros_, ones_
|
||||||
|
from torch.nn import functional
|
||||||
|
|
||||||
|
|
||||||
|
def drop_path(x, drop_prob=0., training=False):
|
||||||
|
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
||||||
|
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
|
||||||
|
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...
|
||||||
|
"""
|
||||||
|
if drop_prob == 0. or not training:
|
||||||
|
return x
|
||||||
|
keep_prob = torch.tensor(1 - drop_prob)
|
||||||
|
shape = (x.size()[0], ) + (1, ) * (x.ndim - 1)
|
||||||
|
random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype)
|
||||||
|
random_tensor = torch.floor(random_tensor) # binarize
|
||||||
|
output = x.divide(keep_prob) * random_tensor
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
class Swish(nn.Module):
|
||||||
|
def __int__(self):
|
||||||
|
super(Swish, self).__int__()
|
||||||
|
|
||||||
|
def forward(self,x):
|
||||||
|
return x*torch.sigmoid(x)
|
||||||
|
|
||||||
|
|
||||||
|
class ConvBNLayer(nn.Module):
|
||||||
|
def __init__(self,
|
||||||
|
in_channels,
|
||||||
|
out_channels,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=1,
|
||||||
|
padding=0,
|
||||||
|
bias_attr=False,
|
||||||
|
groups=1,
|
||||||
|
act=nn.GELU):
|
||||||
|
super().__init__()
|
||||||
|
self.conv = nn.Conv2d(
|
||||||
|
in_channels=in_channels,
|
||||||
|
out_channels=out_channels,
|
||||||
|
kernel_size=kernel_size,
|
||||||
|
stride=stride,
|
||||||
|
padding=padding,
|
||||||
|
groups=groups,
|
||||||
|
# weight_attr=paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()),
|
||||||
|
bias=bias_attr)
|
||||||
|
self.norm = nn.BatchNorm2d(out_channels)
|
||||||
|
self.act = act()
|
||||||
|
|
||||||
|
def forward(self, inputs):
|
||||||
|
out = self.conv(inputs)
|
||||||
|
out = self.norm(out)
|
||||||
|
out = self.act(out)
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
class DropPath(nn.Module):
|
||||||
|
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, drop_prob=None):
|
||||||
|
super(DropPath, self).__init__()
|
||||||
|
self.drop_prob = drop_prob
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
return drop_path(x, self.drop_prob, self.training)
|
||||||
|
|
||||||
|
|
||||||
|
class Identity(nn.Module):
|
||||||
|
def __init__(self):
|
||||||
|
super(Identity, self).__init__()
|
||||||
|
|
||||||
|
def forward(self, input):
|
||||||
|
return input
|
||||||
|
|
||||||
|
|
||||||
|
class Mlp(nn.Module):
|
||||||
|
def __init__(self,
|
||||||
|
in_features,
|
||||||
|
hidden_features=None,
|
||||||
|
out_features=None,
|
||||||
|
act_layer=nn.GELU,
|
||||||
|
drop=0.):
|
||||||
|
super().__init__()
|
||||||
|
out_features = out_features or in_features
|
||||||
|
hidden_features = hidden_features or in_features
|
||||||
|
self.fc1 = nn.Linear(in_features, hidden_features)
|
||||||
|
if isinstance(act_layer, str):
|
||||||
|
self.act = Swish()
|
||||||
|
else:
|
||||||
|
self.act = act_layer()
|
||||||
|
self.fc2 = nn.Linear(hidden_features, out_features)
|
||||||
|
self.drop = nn.Dropout(drop)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
x = self.fc1(x)
|
||||||
|
x = self.act(x)
|
||||||
|
x = self.drop(x)
|
||||||
|
x = self.fc2(x)
|
||||||
|
x = self.drop(x)
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class ConvMixer(nn.Module):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
dim,
|
||||||
|
num_heads=8,
|
||||||
|
HW=(8, 25),
|
||||||
|
local_k=(3, 3), ):
|
||||||
|
super().__init__()
|
||||||
|
self.HW = HW
|
||||||
|
self.dim = dim
|
||||||
|
self.local_mixer = nn.Conv2d(
|
||||||
|
dim,
|
||||||
|
dim,
|
||||||
|
local_k,
|
||||||
|
1, (local_k[0] // 2, local_k[1] // 2),
|
||||||
|
groups=num_heads,
|
||||||
|
# weight_attr=ParamAttr(initializer=KaimingNormal())
|
||||||
|
)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
h = self.HW[0]
|
||||||
|
w = self.HW[1]
|
||||||
|
x = x.transpose([0, 2, 1]).reshape([0, self.dim, h, w])
|
||||||
|
x = self.local_mixer(x)
|
||||||
|
x = x.flatten(2).transpose([0, 2, 1])
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class Attention(nn.Module):
|
||||||
|
def __init__(self,
|
||||||
|
dim,
|
||||||
|
num_heads=8,
|
||||||
|
mixer='Global',
|
||||||
|
HW=(8, 25),
|
||||||
|
local_k=(7, 11),
|
||||||
|
qkv_bias=False,
|
||||||
|
qk_scale=None,
|
||||||
|
attn_drop=0.,
|
||||||
|
proj_drop=0.):
|
||||||
|
super().__init__()
|
||||||
|
self.num_heads = num_heads
|
||||||
|
head_dim = dim // num_heads
|
||||||
|
self.scale = qk_scale or head_dim**-0.5
|
||||||
|
|
||||||
|
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
||||||
|
self.attn_drop = nn.Dropout(attn_drop)
|
||||||
|
self.proj = nn.Linear(dim, dim)
|
||||||
|
self.proj_drop = nn.Dropout(proj_drop)
|
||||||
|
self.HW = HW
|
||||||
|
if HW is not None:
|
||||||
|
H = HW[0]
|
||||||
|
W = HW[1]
|
||||||
|
self.N = H * W
|
||||||
|
self.C = dim
|
||||||
|
if mixer == 'Local' and HW is not None:
|
||||||
|
hk = local_k[0]
|
||||||
|
wk = local_k[1]
|
||||||
|
mask = torch.ones([H * W, H + hk - 1, W + wk - 1])
|
||||||
|
for h in range(0, H):
|
||||||
|
for w in range(0, W):
|
||||||
|
mask[h * W + w, h:h + hk, w:w + wk] = 0.
|
||||||
|
mask_paddle = mask[:, hk // 2:H + hk // 2, wk // 2:W + wk //
|
||||||
|
2].flatten(1)
|
||||||
|
mask_inf = torch.full([H * W, H * W],fill_value=float('-inf'))
|
||||||
|
mask = torch.where(mask_paddle < 1, mask_paddle, mask_inf)
|
||||||
|
self.mask = mask[None,None,:]
|
||||||
|
# self.mask = mask.unsqueeze([0, 1])
|
||||||
|
self.mixer = mixer
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
if self.HW is not None:
|
||||||
|
N = self.N
|
||||||
|
C = self.C
|
||||||
|
else:
|
||||||
|
_, N, C = x.shape
|
||||||
|
qkv = self.qkv(x).reshape((-1, N, 3, self.num_heads, C //self.num_heads)).permute((2, 0, 3, 1, 4))
|
||||||
|
q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
|
||||||
|
|
||||||
|
attn = (q.matmul(k.permute((0, 1, 3, 2))))
|
||||||
|
if self.mixer == 'Local':
|
||||||
|
attn += self.mask
|
||||||
|
attn = functional.softmax(attn, dim=-1)
|
||||||
|
attn = self.attn_drop(attn)
|
||||||
|
|
||||||
|
x = (attn.matmul(v)).permute((0, 2, 1, 3)).reshape((-1, N, C))
|
||||||
|
x = self.proj(x)
|
||||||
|
x = self.proj_drop(x)
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class Block(nn.Module):
|
||||||
|
def __init__(self,
|
||||||
|
dim,
|
||||||
|
num_heads,
|
||||||
|
mixer='Global',
|
||||||
|
local_mixer=(7, 11),
|
||||||
|
HW=(8, 25),
|
||||||
|
mlp_ratio=4.,
|
||||||
|
qkv_bias=False,
|
||||||
|
qk_scale=None,
|
||||||
|
drop=0.,
|
||||||
|
attn_drop=0.,
|
||||||
|
drop_path=0.,
|
||||||
|
act_layer=nn.GELU,
|
||||||
|
norm_layer='nn.LayerNorm',
|
||||||
|
epsilon=1e-6,
|
||||||
|
prenorm=True):
|
||||||
|
super().__init__()
|
||||||
|
if isinstance(norm_layer, str):
|
||||||
|
self.norm1 = eval(norm_layer)(dim, eps=epsilon)
|
||||||
|
else:
|
||||||
|
self.norm1 = norm_layer(dim)
|
||||||
|
if mixer == 'Global' or mixer == 'Local':
|
||||||
|
|
||||||
|
self.mixer = Attention(
|
||||||
|
dim,
|
||||||
|
num_heads=num_heads,
|
||||||
|
mixer=mixer,
|
||||||
|
HW=HW,
|
||||||
|
local_k=local_mixer,
|
||||||
|
qkv_bias=qkv_bias,
|
||||||
|
qk_scale=qk_scale,
|
||||||
|
attn_drop=attn_drop,
|
||||||
|
proj_drop=drop)
|
||||||
|
elif mixer == 'Conv':
|
||||||
|
self.mixer = ConvMixer(
|
||||||
|
dim, num_heads=num_heads, HW=HW, local_k=local_mixer)
|
||||||
|
else:
|
||||||
|
raise TypeError("The mixer must be one of [Global, Local, Conv]")
|
||||||
|
|
||||||
|
self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
|
||||||
|
if isinstance(norm_layer, str):
|
||||||
|
self.norm2 = eval(norm_layer)(dim, eps=epsilon)
|
||||||
|
else:
|
||||||
|
self.norm2 = norm_layer(dim)
|
||||||
|
mlp_hidden_dim = int(dim * mlp_ratio)
|
||||||
|
self.mlp_ratio = mlp_ratio
|
||||||
|
self.mlp = Mlp(in_features=dim,
|
||||||
|
hidden_features=mlp_hidden_dim,
|
||||||
|
act_layer=act_layer,
|
||||||
|
drop=drop)
|
||||||
|
self.prenorm = prenorm
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
if self.prenorm:
|
||||||
|
x = self.norm1(x + self.drop_path(self.mixer(x)))
|
||||||
|
x = self.norm2(x + self.drop_path(self.mlp(x)))
|
||||||
|
else:
|
||||||
|
x = x + self.drop_path(self.mixer(self.norm1(x)))
|
||||||
|
x = x + self.drop_path(self.mlp(self.norm2(x)))
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class PatchEmbed(nn.Module):
|
||||||
|
""" Image to Patch Embedding
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
img_size=(32, 100),
|
||||||
|
in_channels=3,
|
||||||
|
embed_dim=768,
|
||||||
|
sub_num=2):
|
||||||
|
super().__init__()
|
||||||
|
num_patches = (img_size[1] // (2 ** sub_num)) * \
|
||||||
|
(img_size[0] // (2 ** sub_num))
|
||||||
|
self.img_size = img_size
|
||||||
|
self.num_patches = num_patches
|
||||||
|
self.embed_dim = embed_dim
|
||||||
|
self.norm = None
|
||||||
|
if sub_num == 2:
|
||||||
|
self.proj = nn.Sequential(
|
||||||
|
ConvBNLayer(
|
||||||
|
in_channels=in_channels,
|
||||||
|
out_channels=embed_dim // 2,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=2,
|
||||||
|
padding=1,
|
||||||
|
act=nn.GELU,
|
||||||
|
bias_attr=False),
|
||||||
|
ConvBNLayer(
|
||||||
|
in_channels=embed_dim // 2,
|
||||||
|
out_channels=embed_dim,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=2,
|
||||||
|
padding=1,
|
||||||
|
act=nn.GELU,
|
||||||
|
bias_attr=False))
|
||||||
|
if sub_num == 3:
|
||||||
|
self.proj = nn.Sequential(
|
||||||
|
ConvBNLayer(
|
||||||
|
in_channels=in_channels,
|
||||||
|
out_channels=embed_dim // 4,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=2,
|
||||||
|
padding=1,
|
||||||
|
act=nn.GELU,
|
||||||
|
bias_attr=False),
|
||||||
|
ConvBNLayer(
|
||||||
|
in_channels=embed_dim // 4,
|
||||||
|
out_channels=embed_dim // 2,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=2,
|
||||||
|
padding=1,
|
||||||
|
act=nn.GELU,
|
||||||
|
bias_attr=False),
|
||||||
|
ConvBNLayer(
|
||||||
|
in_channels=embed_dim // 2,
|
||||||
|
out_channels=embed_dim,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=2,
|
||||||
|
padding=1,
|
||||||
|
act=nn.GELU,
|
||||||
|
bias_attr=False))
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
B, C, H, W = x.shape
|
||||||
|
assert H == self.img_size[0] and W == self.img_size[1], \
|
||||||
|
f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
|
||||||
|
x = self.proj(x).flatten(2).permute(0, 2, 1)
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class SubSample(nn.Module):
|
||||||
|
def __init__(self,
|
||||||
|
in_channels,
|
||||||
|
out_channels,
|
||||||
|
types='Pool',
|
||||||
|
stride=(2, 1),
|
||||||
|
sub_norm='nn.LayerNorm',
|
||||||
|
act=None):
|
||||||
|
super().__init__()
|
||||||
|
self.types = types
|
||||||
|
if types == 'Pool':
|
||||||
|
self.avgpool = nn.AvgPool2d(
|
||||||
|
kernel_size=(3, 5), stride=stride, padding=(1, 2))
|
||||||
|
self.maxpool = nn.MaxPool2d(
|
||||||
|
kernel_size=(3, 5), stride=stride, padding=(1, 2))
|
||||||
|
self.proj = nn.Linear(in_channels, out_channels)
|
||||||
|
else:
|
||||||
|
self.conv = nn.Conv2d(
|
||||||
|
in_channels,
|
||||||
|
out_channels,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=stride,
|
||||||
|
padding=1,
|
||||||
|
# weight_attr=ParamAttr(initializer=KaimingNormal())
|
||||||
|
)
|
||||||
|
self.norm = eval(sub_norm)(out_channels)
|
||||||
|
if act is not None:
|
||||||
|
self.act = act()
|
||||||
|
else:
|
||||||
|
self.act = None
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
|
||||||
|
if self.types == 'Pool':
|
||||||
|
x1 = self.avgpool(x)
|
||||||
|
x2 = self.maxpool(x)
|
||||||
|
x = (x1 + x2) * 0.5
|
||||||
|
out = self.proj(x.flatten(2).permute((0, 2, 1)))
|
||||||
|
else:
|
||||||
|
x = self.conv(x)
|
||||||
|
out = x.flatten(2).permute((0, 2, 1))
|
||||||
|
out = self.norm(out)
|
||||||
|
if self.act is not None:
|
||||||
|
out = self.act(out)
|
||||||
|
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
class SVTRNet(nn.Module):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
img_size=[48, 100],
|
||||||
|
in_channels=3,
|
||||||
|
embed_dim=[64, 128, 256],
|
||||||
|
depth=[3, 6, 3],
|
||||||
|
num_heads=[2, 4, 8],
|
||||||
|
mixer=['Local'] * 6 + ['Global'] *
|
||||||
|
6, # Local atten, Global atten, Conv
|
||||||
|
local_mixer=[[7, 11], [7, 11], [7, 11]],
|
||||||
|
patch_merging='Conv', # Conv, Pool, None
|
||||||
|
mlp_ratio=4,
|
||||||
|
qkv_bias=True,
|
||||||
|
qk_scale=None,
|
||||||
|
drop_rate=0.,
|
||||||
|
last_drop=0.1,
|
||||||
|
attn_drop_rate=0.,
|
||||||
|
drop_path_rate=0.1,
|
||||||
|
norm_layer='nn.LayerNorm',
|
||||||
|
sub_norm='nn.LayerNorm',
|
||||||
|
epsilon=1e-6,
|
||||||
|
out_channels=192,
|
||||||
|
out_char_num=25,
|
||||||
|
block_unit='Block',
|
||||||
|
act='nn.GELU',
|
||||||
|
last_stage=True,
|
||||||
|
sub_num=2,
|
||||||
|
prenorm=True,
|
||||||
|
use_lenhead=False,
|
||||||
|
**kwargs):
|
||||||
|
super().__init__()
|
||||||
|
self.img_size = img_size
|
||||||
|
self.embed_dim = embed_dim
|
||||||
|
self.out_channels = out_channels
|
||||||
|
self.prenorm = prenorm
|
||||||
|
patch_merging = None if patch_merging != 'Conv' and patch_merging != 'Pool' else patch_merging
|
||||||
|
self.patch_embed = PatchEmbed(
|
||||||
|
img_size=img_size,
|
||||||
|
in_channels=in_channels,
|
||||||
|
embed_dim=embed_dim[0],
|
||||||
|
sub_num=sub_num)
|
||||||
|
num_patches = self.patch_embed.num_patches
|
||||||
|
self.HW = [img_size[0] // (2**sub_num), img_size[1] // (2**sub_num)]
|
||||||
|
self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim[0]))
|
||||||
|
# self.pos_embed = self.create_parameter(
|
||||||
|
# shape=[1, num_patches, embed_dim[0]], default_initializer=zeros_)
|
||||||
|
|
||||||
|
# self.add_parameter("pos_embed", self.pos_embed)
|
||||||
|
|
||||||
|
self.pos_drop = nn.Dropout(p=drop_rate)
|
||||||
|
Block_unit = eval(block_unit)
|
||||||
|
|
||||||
|
dpr = np.linspace(0, drop_path_rate, sum(depth))
|
||||||
|
self.blocks1 = nn.ModuleList(
|
||||||
|
[
|
||||||
|
Block_unit(
|
||||||
|
dim=embed_dim[0],
|
||||||
|
num_heads=num_heads[0],
|
||||||
|
mixer=mixer[0:depth[0]][i],
|
||||||
|
HW=self.HW,
|
||||||
|
local_mixer=local_mixer[0],
|
||||||
|
mlp_ratio=mlp_ratio,
|
||||||
|
qkv_bias=qkv_bias,
|
||||||
|
qk_scale=qk_scale,
|
||||||
|
drop=drop_rate,
|
||||||
|
act_layer=eval(act),
|
||||||
|
attn_drop=attn_drop_rate,
|
||||||
|
drop_path=dpr[0:depth[0]][i],
|
||||||
|
norm_layer=norm_layer,
|
||||||
|
epsilon=epsilon,
|
||||||
|
prenorm=prenorm) for i in range(depth[0])
|
||||||
|
]
|
||||||
|
)
|
||||||
|
if patch_merging is not None:
|
||||||
|
self.sub_sample1 = SubSample(
|
||||||
|
embed_dim[0],
|
||||||
|
embed_dim[1],
|
||||||
|
sub_norm=sub_norm,
|
||||||
|
stride=[2, 1],
|
||||||
|
types=patch_merging)
|
||||||
|
HW = [self.HW[0] // 2, self.HW[1]]
|
||||||
|
else:
|
||||||
|
HW = self.HW
|
||||||
|
self.patch_merging = patch_merging
|
||||||
|
self.blocks2 = nn.ModuleList([
|
||||||
|
Block_unit(
|
||||||
|
dim=embed_dim[1],
|
||||||
|
num_heads=num_heads[1],
|
||||||
|
mixer=mixer[depth[0]:depth[0] + depth[1]][i],
|
||||||
|
HW=HW,
|
||||||
|
local_mixer=local_mixer[1],
|
||||||
|
mlp_ratio=mlp_ratio,
|
||||||
|
qkv_bias=qkv_bias,
|
||||||
|
qk_scale=qk_scale,
|
||||||
|
drop=drop_rate,
|
||||||
|
act_layer=eval(act),
|
||||||
|
attn_drop=attn_drop_rate,
|
||||||
|
drop_path=dpr[depth[0]:depth[0] + depth[1]][i],
|
||||||
|
norm_layer=norm_layer,
|
||||||
|
epsilon=epsilon,
|
||||||
|
prenorm=prenorm) for i in range(depth[1])
|
||||||
|
])
|
||||||
|
if patch_merging is not None:
|
||||||
|
self.sub_sample2 = SubSample(
|
||||||
|
embed_dim[1],
|
||||||
|
embed_dim[2],
|
||||||
|
sub_norm=sub_norm,
|
||||||
|
stride=[2, 1],
|
||||||
|
types=patch_merging)
|
||||||
|
HW = [self.HW[0] // 4, self.HW[1]]
|
||||||
|
else:
|
||||||
|
HW = self.HW
|
||||||
|
self.blocks3 = nn.ModuleList([
|
||||||
|
Block_unit(
|
||||||
|
dim=embed_dim[2],
|
||||||
|
num_heads=num_heads[2],
|
||||||
|
mixer=mixer[depth[0] + depth[1]:][i],
|
||||||
|
HW=HW,
|
||||||
|
local_mixer=local_mixer[2],
|
||||||
|
mlp_ratio=mlp_ratio,
|
||||||
|
qkv_bias=qkv_bias,
|
||||||
|
qk_scale=qk_scale,
|
||||||
|
drop=drop_rate,
|
||||||
|
act_layer=eval(act),
|
||||||
|
attn_drop=attn_drop_rate,
|
||||||
|
drop_path=dpr[depth[0] + depth[1]:][i],
|
||||||
|
norm_layer=norm_layer,
|
||||||
|
epsilon=epsilon,
|
||||||
|
prenorm=prenorm) for i in range(depth[2])
|
||||||
|
])
|
||||||
|
self.last_stage = last_stage
|
||||||
|
if last_stage:
|
||||||
|
self.avg_pool = nn.AdaptiveAvgPool2d((1, out_char_num))
|
||||||
|
self.last_conv = nn.Conv2d(
|
||||||
|
in_channels=embed_dim[2],
|
||||||
|
out_channels=self.out_channels,
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0,
|
||||||
|
bias=False)
|
||||||
|
self.hardswish = nn.Hardswish()
|
||||||
|
self.dropout = nn.Dropout(p=last_drop)
|
||||||
|
if not prenorm:
|
||||||
|
self.norm = eval(norm_layer)(embed_dim[-1], epsilon=epsilon)
|
||||||
|
self.use_lenhead = use_lenhead
|
||||||
|
if use_lenhead:
|
||||||
|
self.len_conv = nn.Linear(embed_dim[2], self.out_channels)
|
||||||
|
self.hardswish_len = nn.Hardswish()
|
||||||
|
self.dropout_len = nn.Dropout(
|
||||||
|
p=last_drop)
|
||||||
|
|
||||||
|
trunc_normal_(self.pos_embed,std=.02)
|
||||||
|
self.apply(self._init_weights)
|
||||||
|
|
||||||
|
def _init_weights(self, m):
|
||||||
|
if isinstance(m, nn.Linear):
|
||||||
|
trunc_normal_(m.weight,std=.02)
|
||||||
|
if isinstance(m, nn.Linear) and m.bias is not None:
|
||||||
|
zeros_(m.bias)
|
||||||
|
elif isinstance(m, nn.LayerNorm):
|
||||||
|
zeros_(m.bias)
|
||||||
|
ones_(m.weight)
|
||||||
|
|
||||||
|
def forward_features(self, x):
|
||||||
|
x = self.patch_embed(x)
|
||||||
|
x = x + self.pos_embed
|
||||||
|
x = self.pos_drop(x)
|
||||||
|
for blk in self.blocks1:
|
||||||
|
x = blk(x)
|
||||||
|
if self.patch_merging is not None:
|
||||||
|
x = self.sub_sample1(
|
||||||
|
x.permute([0, 2, 1]).reshape(
|
||||||
|
[-1, self.embed_dim[0], self.HW[0], self.HW[1]]))
|
||||||
|
for blk in self.blocks2:
|
||||||
|
x = blk(x)
|
||||||
|
if self.patch_merging is not None:
|
||||||
|
x = self.sub_sample2(
|
||||||
|
x.permute([0, 2, 1]).reshape(
|
||||||
|
[-1, self.embed_dim[1], self.HW[0] // 2, self.HW[1]]))
|
||||||
|
for blk in self.blocks3:
|
||||||
|
x = blk(x)
|
||||||
|
if not self.prenorm:
|
||||||
|
x = self.norm(x)
|
||||||
|
return x
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
x = self.forward_features(x)
|
||||||
|
if self.use_lenhead:
|
||||||
|
len_x = self.len_conv(x.mean(1))
|
||||||
|
len_x = self.dropout_len(self.hardswish_len(len_x))
|
||||||
|
if self.last_stage:
|
||||||
|
if self.patch_merging is not None:
|
||||||
|
h = self.HW[0] // 4
|
||||||
|
else:
|
||||||
|
h = self.HW[0]
|
||||||
|
x = self.avg_pool(
|
||||||
|
x.permute([0, 2, 1]).reshape(
|
||||||
|
[-1, self.embed_dim[2], h, self.HW[1]]))
|
||||||
|
x = self.last_conv(x)
|
||||||
|
x = self.hardswish(x)
|
||||||
|
x = self.dropout(x)
|
||||||
|
if self.use_lenhead:
|
||||||
|
return x, len_x
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
if __name__=="__main__":
|
||||||
|
a = torch.rand(1,3,48,100)
|
||||||
|
svtr = SVTRNet()
|
||||||
|
|
||||||
|
out = svtr(a)
|
||||||
|
print(svtr)
|
||||||
|
print(out.size())
|
74
AnyText/AnyText_scripts/cldm/ocr_recog/common.py
Normal file
@ -0,0 +1,74 @@
|
|||||||
|
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.nn.functional as F
|
||||||
|
|
||||||
|
|
||||||
|
class Hswish(nn.Module):
|
||||||
|
def __init__(self, inplace=True):
|
||||||
|
super(Hswish, self).__init__()
|
||||||
|
self.inplace = inplace
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
return x * F.relu6(x + 3., inplace=self.inplace) / 6.
|
||||||
|
|
||||||
|
# out = max(0, min(1, slop*x+offset))
|
||||||
|
# paddle.fluid.layers.hard_sigmoid(x, slope=0.2, offset=0.5, name=None)
|
||||||
|
class Hsigmoid(nn.Module):
|
||||||
|
def __init__(self, inplace=True):
|
||||||
|
super(Hsigmoid, self).__init__()
|
||||||
|
self.inplace = inplace
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
# torch: F.relu6(x + 3., inplace=self.inplace) / 6.
|
||||||
|
# paddle: F.relu6(1.2 * x + 3., inplace=self.inplace) / 6.
|
||||||
|
return F.relu6(1.2 * x + 3., inplace=self.inplace) / 6.
|
||||||
|
|
||||||
|
class GELU(nn.Module):
|
||||||
|
def __init__(self, inplace=True):
|
||||||
|
super(GELU, self).__init__()
|
||||||
|
self.inplace = inplace
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
return torch.nn.functional.gelu(x)
|
||||||
|
|
||||||
|
|
||||||
|
class Swish(nn.Module):
|
||||||
|
def __init__(self, inplace=True):
|
||||||
|
super(Swish, self).__init__()
|
||||||
|
self.inplace = inplace
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
if self.inplace:
|
||||||
|
x.mul_(torch.sigmoid(x))
|
||||||
|
return x
|
||||||
|
else:
|
||||||
|
return x*torch.sigmoid(x)
|
||||||
|
|
||||||
|
|
||||||
|
class Activation(nn.Module):
|
||||||
|
def __init__(self, act_type, inplace=True):
|
||||||
|
super(Activation, self).__init__()
|
||||||
|
act_type = act_type.lower()
|
||||||
|
if act_type == 'relu':
|
||||||
|
self.act = nn.ReLU(inplace=inplace)
|
||||||
|
elif act_type == 'relu6':
|
||||||
|
self.act = nn.ReLU6(inplace=inplace)
|
||||||
|
elif act_type == 'sigmoid':
|
||||||
|
raise NotImplementedError
|
||||||
|
elif act_type == 'hard_sigmoid':
|
||||||
|
self.act = Hsigmoid(inplace)
|
||||||
|
elif act_type == 'hard_swish':
|
||||||
|
self.act = Hswish(inplace=inplace)
|
||||||
|
elif act_type == 'leakyrelu':
|
||||||
|
self.act = nn.LeakyReLU(inplace=inplace)
|
||||||
|
elif act_type == 'gelu':
|
||||||
|
self.act = GELU(inplace=inplace)
|
||||||
|
elif act_type == 'swish':
|
||||||
|
self.act = Swish(inplace=inplace)
|
||||||
|
else:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def forward(self, inputs):
|
||||||
|
return self.act(inputs)
|
95
AnyText/AnyText_scripts/cldm/ocr_recog/en_dict.txt
Normal file
@ -0,0 +1,95 @@
|
|||||||
|
0
|
||||||
|
1
|
||||||
|
2
|
||||||
|
3
|
||||||
|
4
|
||||||
|
5
|
||||||
|
6
|
||||||
|
7
|
||||||
|
8
|
||||||
|
9
|
||||||
|
:
|
||||||
|
;
|
||||||
|
<
|
||||||
|
=
|
||||||
|
>
|
||||||
|
?
|
||||||
|
@
|
||||||
|
A
|
||||||
|
B
|
||||||
|
C
|
||||||
|
D
|
||||||
|
E
|
||||||
|
F
|
||||||
|
G
|
||||||
|
H
|
||||||
|
I
|
||||||
|
J
|
||||||
|
K
|
||||||
|
L
|
||||||
|
M
|
||||||
|
N
|
||||||
|
O
|
||||||
|
P
|
||||||
|
Q
|
||||||
|
R
|
||||||
|
S
|
||||||
|
T
|
||||||
|
U
|
||||||
|
V
|
||||||
|
W
|
||||||
|
X
|
||||||
|
Y
|
||||||
|
Z
|
||||||
|
[
|
||||||
|
\
|
||||||
|
]
|
||||||
|
^
|
||||||
|
_
|
||||||
|
`
|
||||||
|
a
|
||||||
|
b
|
||||||
|
c
|
||||||
|
d
|
||||||
|
e
|
||||||
|
f
|
||||||
|
g
|
||||||
|
h
|
||||||
|
i
|
||||||
|
j
|
||||||
|
k
|
||||||
|
l
|
||||||
|
m
|
||||||
|
n
|
||||||
|
o
|
||||||
|
p
|
||||||
|
q
|
||||||
|
r
|
||||||
|
s
|
||||||
|
t
|
||||||
|
u
|
||||||
|
v
|
||||||
|
w
|
||||||
|
x
|
||||||
|
y
|
||||||
|
z
|
||||||
|
{
|
||||||
|
|
|
||||||
|
}
|
||||||
|
~
|
||||||
|
!
|
||||||
|
"
|
||||||
|
#
|
||||||
|
$
|
||||||
|
%
|
||||||
|
&
|
||||||
|
'
|
||||||
|
(
|
||||||
|
)
|
||||||
|
*
|
||||||
|
+
|
||||||
|
,
|
||||||
|
-
|
||||||
|
.
|
||||||
|
/
|
||||||
|
|
6623
AnyText/AnyText_scripts/cldm/ocr_recog/ppocr_keys_v1.txt
Normal file
310
AnyText/AnyText_scripts/cldm/recognizer.py
Normal file
@ -0,0 +1,310 @@
|
|||||||
|
'''
|
||||||
|
Copyright (c) Alibaba, Inc. and its affiliates.
|
||||||
|
'''
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
||||||
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
import math
|
||||||
|
import traceback
|
||||||
|
from easydict import EasyDict as edict
|
||||||
|
import time
|
||||||
|
from .ocr_recog.RecModel import RecModel
|
||||||
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from skimage.transform._geometric import _umeyama as get_sym_mat
|
||||||
|
|
||||||
|
current_directory = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
ocr_txt_path = os.path.join(os.path.dirname(os.path.dirname(current_directory)), "ocr_weights", "ppocr_keys_v1.txt")
|
||||||
|
ocr_model_path = os.path.join(os.path.dirname(os.path.dirname(current_directory)), "ocr_weights", "ppv3_rec.pth")
|
||||||
|
|
||||||
|
def min_bounding_rect(img):
|
||||||
|
ret, thresh = cv2.threshold(img, 127, 255, 0)
|
||||||
|
contours, hierarchy = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||||
|
if len(contours) == 0:
|
||||||
|
print('Bad contours, using fake bbox...')
|
||||||
|
return np.array([[0, 0], [100, 0], [100, 100], [0, 100]])
|
||||||
|
max_contour = max(contours, key=cv2.contourArea)
|
||||||
|
rect = cv2.minAreaRect(max_contour)
|
||||||
|
box = cv2.boxPoints(rect)
|
||||||
|
box = np.int0(box)
|
||||||
|
# sort
|
||||||
|
x_sorted = sorted(box, key=lambda x: x[0])
|
||||||
|
left = x_sorted[:2]
|
||||||
|
right = x_sorted[2:]
|
||||||
|
left = sorted(left, key=lambda x: x[1])
|
||||||
|
(tl, bl) = left
|
||||||
|
right = sorted(right, key=lambda x: x[1])
|
||||||
|
(tr, br) = right
|
||||||
|
if tl[1] > bl[1]:
|
||||||
|
(tl, bl) = (bl, tl)
|
||||||
|
if tr[1] > br[1]:
|
||||||
|
(tr, br) = (br, tr)
|
||||||
|
return np.array([tl, tr, br, bl])
|
||||||
|
|
||||||
|
|
||||||
|
def adjust_image(box, img):
|
||||||
|
pts1 = np.float32([box[0], box[1], box[2], box[3]])
|
||||||
|
width = max(np.linalg.norm(pts1[0]-pts1[1]), np.linalg.norm(pts1[2]-pts1[3]))
|
||||||
|
height = max(np.linalg.norm(pts1[0]-pts1[3]), np.linalg.norm(pts1[1]-pts1[2]))
|
||||||
|
pts2 = np.float32([[0, 0], [width, 0], [width, height], [0, height]])
|
||||||
|
# get transform matrix
|
||||||
|
M = get_sym_mat(pts1, pts2, estimate_scale=True)
|
||||||
|
C, H, W = img.shape
|
||||||
|
T = np.array([[2 / W, 0, -1], [0, 2 / H, -1], [0, 0, 1]])
|
||||||
|
theta = np.linalg.inv(T @ M @ np.linalg.inv(T))
|
||||||
|
theta = torch.from_numpy(theta[:2, :]).unsqueeze(0).type(torch.float32).to(img.device)
|
||||||
|
grid = F.affine_grid(theta, torch.Size([1, C, H, W]), align_corners=True)
|
||||||
|
result = F.grid_sample(img.unsqueeze(0), grid, align_corners=True)
|
||||||
|
result = torch.clamp(result.squeeze(0), 0, 255)
|
||||||
|
# crop
|
||||||
|
result = result[:, :int(height), :int(width)]
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
'''
|
||||||
|
mask: numpy.ndarray, mask of textual, HWC
|
||||||
|
src_img: torch.Tensor, source image, CHW
|
||||||
|
'''
|
||||||
|
def crop_image(src_img, mask):
|
||||||
|
box = min_bounding_rect(mask)
|
||||||
|
result = adjust_image(box, src_img)
|
||||||
|
if len(result.shape) == 2:
|
||||||
|
result = torch.stack([result]*3, axis=-1)
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def create_predictor(model_dir=None, model_lang='ch', is_onnx=False):
|
||||||
|
model_file_path = model_dir
|
||||||
|
if model_file_path is not None and not os.path.exists(model_file_path):
|
||||||
|
raise ValueError("not find model file path {}".format(model_file_path))
|
||||||
|
|
||||||
|
if is_onnx:
|
||||||
|
import onnxruntime as ort
|
||||||
|
sess = ort.InferenceSession(model_file_path, providers=['CPUExecutionProvider']) # 'TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider'
|
||||||
|
return sess
|
||||||
|
else:
|
||||||
|
if model_lang == 'ch':
|
||||||
|
n_class = 6625
|
||||||
|
elif model_lang == 'en':
|
||||||
|
n_class = 97
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unsupported OCR recog model_lang: {model_lang}")
|
||||||
|
rec_config = edict(
|
||||||
|
in_channels=3,
|
||||||
|
backbone=edict(type='MobileNetV1Enhance', scale=0.5, last_conv_stride=[1, 2], last_pool_type='avg'),
|
||||||
|
neck=edict(type='SequenceEncoder', encoder_type="svtr", dims=64, depth=2, hidden_dims=120, use_guide=True),
|
||||||
|
head=edict(type='CTCHead', fc_decay=0.00001, out_channels=n_class, return_feats=True)
|
||||||
|
)
|
||||||
|
|
||||||
|
rec_model = RecModel(rec_config)
|
||||||
|
if model_file_path is not None:
|
||||||
|
rec_model.load_state_dict(torch.load(model_file_path, map_location="cpu"))
|
||||||
|
rec_model.eval()
|
||||||
|
return rec_model.eval()
|
||||||
|
|
||||||
|
|
||||||
|
def _check_image_file(path):
|
||||||
|
img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff'}
|
||||||
|
return any([path.lower().endswith(e) for e in img_end])
|
||||||
|
|
||||||
|
|
||||||
|
def get_image_file_list(img_file):
|
||||||
|
imgs_lists = []
|
||||||
|
if img_file is None or not os.path.exists(img_file):
|
||||||
|
raise Exception("not found any img file in {}".format(img_file))
|
||||||
|
if os.path.isfile(img_file) and _check_image_file(img_file):
|
||||||
|
imgs_lists.append(img_file)
|
||||||
|
elif os.path.isdir(img_file):
|
||||||
|
for single_file in os.listdir(img_file):
|
||||||
|
file_path = os.path.join(img_file, single_file)
|
||||||
|
if os.path.isfile(file_path) and _check_image_file(file_path):
|
||||||
|
imgs_lists.append(file_path)
|
||||||
|
if len(imgs_lists) == 0:
|
||||||
|
raise Exception("not found any img file in {}".format(img_file))
|
||||||
|
imgs_lists = sorted(imgs_lists)
|
||||||
|
return imgs_lists
|
||||||
|
|
||||||
|
|
||||||
|
class TextRecognizer(object):
|
||||||
|
def __init__(self, args, predictor):
|
||||||
|
self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")]
|
||||||
|
self.rec_batch_num = args.rec_batch_num
|
||||||
|
self.predictor = predictor
|
||||||
|
self.chars = self.get_char_dict(ocr_txt_path)
|
||||||
|
self.char2id = {x: i for i, x in enumerate(self.chars)}
|
||||||
|
self.is_onnx = not isinstance(self.predictor, torch.nn.Module)
|
||||||
|
self.use_fp16 = args.use_fp16
|
||||||
|
|
||||||
|
# img: CHW
|
||||||
|
def resize_norm_img(self, img, max_wh_ratio):
|
||||||
|
imgC, imgH, imgW = self.rec_image_shape
|
||||||
|
assert imgC == img.shape[0]
|
||||||
|
imgW = int((imgH * max_wh_ratio))
|
||||||
|
|
||||||
|
h, w = img.shape[1:]
|
||||||
|
ratio = w / float(h)
|
||||||
|
if math.ceil(imgH * ratio) > imgW:
|
||||||
|
resized_w = imgW
|
||||||
|
else:
|
||||||
|
resized_w = int(math.ceil(imgH * ratio))
|
||||||
|
resized_image = torch.nn.functional.interpolate(
|
||||||
|
img.unsqueeze(0),
|
||||||
|
size=(imgH, resized_w),
|
||||||
|
mode='bilinear',
|
||||||
|
align_corners=True,
|
||||||
|
)
|
||||||
|
resized_image /= 255.0
|
||||||
|
resized_image -= 0.5
|
||||||
|
resized_image /= 0.5
|
||||||
|
padding_im = torch.zeros((imgC, imgH, imgW), dtype=torch.float32).to(img.device)
|
||||||
|
padding_im[:, :, 0:resized_w] = resized_image[0]
|
||||||
|
return padding_im
|
||||||
|
|
||||||
|
# img_list: list of tensors with shape chw 0-255
|
||||||
|
def pred_imglist(self, img_list, show_debug=False):
|
||||||
|
img_num = len(img_list)
|
||||||
|
assert img_num > 0
|
||||||
|
# Calculate the aspect ratio of all text bars
|
||||||
|
width_list = []
|
||||||
|
for img in img_list:
|
||||||
|
width_list.append(img.shape[2] / float(img.shape[1]))
|
||||||
|
# Sorting can speed up the recognition process
|
||||||
|
indices = torch.from_numpy(np.argsort(np.array(width_list)))
|
||||||
|
batch_num = self.rec_batch_num
|
||||||
|
preds_all = [None] * img_num
|
||||||
|
preds_neck_all = [None] * img_num
|
||||||
|
for beg_img_no in range(0, img_num, batch_num):
|
||||||
|
end_img_no = min(img_num, beg_img_no + batch_num)
|
||||||
|
norm_img_batch = []
|
||||||
|
|
||||||
|
imgC, imgH, imgW = self.rec_image_shape[:3]
|
||||||
|
max_wh_ratio = imgW / imgH
|
||||||
|
for ino in range(beg_img_no, end_img_no):
|
||||||
|
h, w = img_list[indices[ino]].shape[1:]
|
||||||
|
if h > w * 1.2:
|
||||||
|
img = img_list[indices[ino]]
|
||||||
|
img = torch.transpose(img, 1, 2).flip(dims=[1])
|
||||||
|
img_list[indices[ino]] = img
|
||||||
|
h, w = img.shape[1:]
|
||||||
|
# wh_ratio = w * 1.0 / h
|
||||||
|
# max_wh_ratio = max(max_wh_ratio, wh_ratio) # comment to not use different ratio
|
||||||
|
for ino in range(beg_img_no, end_img_no):
|
||||||
|
norm_img = self.resize_norm_img(img_list[indices[ino]], max_wh_ratio)
|
||||||
|
if self.use_fp16:
|
||||||
|
norm_img = norm_img.half()
|
||||||
|
norm_img = norm_img.unsqueeze(0)
|
||||||
|
norm_img_batch.append(norm_img)
|
||||||
|
norm_img_batch = torch.cat(norm_img_batch, dim=0)
|
||||||
|
if show_debug:
|
||||||
|
for i in range(len(norm_img_batch)):
|
||||||
|
_img = norm_img_batch[i].permute(1, 2, 0).detach().cpu().numpy()
|
||||||
|
_img = (_img + 0.5)*255
|
||||||
|
_img = _img[:, :, ::-1]
|
||||||
|
file_name = f'{indices[beg_img_no + i]}'
|
||||||
|
if os.path.exists(file_name + '.jpg'):
|
||||||
|
file_name += '_2' # ori image
|
||||||
|
cv2.imwrite(file_name + '.jpg', _img)
|
||||||
|
if self.is_onnx:
|
||||||
|
input_dict = {}
|
||||||
|
input_dict[self.predictor.get_inputs()[0].name] = norm_img_batch.detach().cpu().numpy()
|
||||||
|
outputs = self.predictor.run(None, input_dict)
|
||||||
|
preds = {}
|
||||||
|
preds['ctc'] = torch.from_numpy(outputs[0])
|
||||||
|
preds['ctc_neck'] = [torch.zeros(1)] * img_num
|
||||||
|
else:
|
||||||
|
preds = self.predictor(norm_img_batch)
|
||||||
|
for rno in range(preds['ctc'].shape[0]):
|
||||||
|
preds_all[indices[beg_img_no + rno]] = preds['ctc'][rno]
|
||||||
|
preds_neck_all[indices[beg_img_no + rno]] = preds['ctc_neck'][rno]
|
||||||
|
|
||||||
|
return torch.stack(preds_all, dim=0), torch.stack(preds_neck_all, dim=0)
|
||||||
|
|
||||||
|
def get_char_dict(self, character_dict_path):
|
||||||
|
character_str = []
|
||||||
|
with open(character_dict_path, "rb") as fin:
|
||||||
|
lines = fin.readlines()
|
||||||
|
for line in lines:
|
||||||
|
line = line.decode('utf-8').strip("\n").strip("\r\n")
|
||||||
|
character_str.append(line)
|
||||||
|
dict_character = list(character_str)
|
||||||
|
dict_character = ['sos'] + dict_character + [' '] # eos is space
|
||||||
|
return dict_character
|
||||||
|
|
||||||
|
def get_text(self, order):
|
||||||
|
char_list = [self.chars[text_id] for text_id in order]
|
||||||
|
return ''.join(char_list)
|
||||||
|
|
||||||
|
def decode(self, mat):
|
||||||
|
text_index = mat.detach().cpu().numpy().argmax(axis=1)
|
||||||
|
ignored_tokens = [0]
|
||||||
|
selection = np.ones(len(text_index), dtype=bool)
|
||||||
|
selection[1:] = text_index[1:] != text_index[:-1]
|
||||||
|
for ignored_token in ignored_tokens:
|
||||||
|
selection &= text_index != ignored_token
|
||||||
|
return text_index[selection], np.where(selection)[0]
|
||||||
|
|
||||||
|
def get_ctcloss(self, preds, gt_text, weight):
|
||||||
|
if not isinstance(weight, torch.Tensor):
|
||||||
|
weight = torch.tensor(weight).to(preds.device)
|
||||||
|
ctc_loss = torch.nn.CTCLoss(reduction='none')
|
||||||
|
log_probs = preds.log_softmax(dim=2).permute(1, 0, 2) # NTC-->TNC
|
||||||
|
targets = []
|
||||||
|
target_lengths = []
|
||||||
|
for t in gt_text:
|
||||||
|
targets += [self.char2id.get(i, len(self.chars)-1) for i in t]
|
||||||
|
target_lengths += [len(t)]
|
||||||
|
targets = torch.tensor(targets).to(preds.device)
|
||||||
|
target_lengths = torch.tensor(target_lengths).to(preds.device)
|
||||||
|
input_lengths = torch.tensor([log_probs.shape[0]]*(log_probs.shape[1])).to(preds.device)
|
||||||
|
loss = ctc_loss(log_probs, targets, input_lengths, target_lengths)
|
||||||
|
loss = loss / input_lengths * weight
|
||||||
|
return loss
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
rec_model_dir = ocr_model_path
|
||||||
|
predictor = create_predictor(rec_model_dir)
|
||||||
|
args = edict()
|
||||||
|
args.rec_image_shape = "3, 48, 320"
|
||||||
|
args.rec_char_dict_path = ocr_txt_path
|
||||||
|
args.rec_batch_num = 6
|
||||||
|
text_recognizer = TextRecognizer(args, predictor)
|
||||||
|
image_dir = './test_imgs_cn'
|
||||||
|
gt_text = ['韩国小馆']*14
|
||||||
|
|
||||||
|
image_file_list = get_image_file_list(image_dir)
|
||||||
|
valid_image_file_list = []
|
||||||
|
img_list = []
|
||||||
|
|
||||||
|
for image_file in image_file_list:
|
||||||
|
img = cv2.imread(image_file)
|
||||||
|
if img is None:
|
||||||
|
print("error in loading image:{}".format(image_file))
|
||||||
|
continue
|
||||||
|
valid_image_file_list.append(image_file)
|
||||||
|
img_list.append(torch.from_numpy(img).permute(2, 0, 1).float())
|
||||||
|
try:
|
||||||
|
tic = time.time()
|
||||||
|
times = []
|
||||||
|
for i in range(10):
|
||||||
|
preds, _ = text_recognizer.pred_imglist(img_list) # get text
|
||||||
|
preds_all = preds.softmax(dim=2)
|
||||||
|
times += [(time.time()-tic)*1000.]
|
||||||
|
tic = time.time()
|
||||||
|
print(times)
|
||||||
|
print(np.mean(times[1:]) / len(preds_all))
|
||||||
|
weight = np.ones(len(gt_text))
|
||||||
|
loss = text_recognizer.get_ctcloss(preds, gt_text, weight)
|
||||||
|
for i in range(len(valid_image_file_list)):
|
||||||
|
pred = preds_all[i]
|
||||||
|
order, idx = text_recognizer.decode(pred)
|
||||||
|
text = text_recognizer.get_text(order)
|
||||||
|
print(f'{valid_image_file_list[i]}: pred/gt="{text}"/"{gt_text[i]}", loss={loss[i]:.2f}')
|
||||||
|
except Exception as E:
|
||||||
|
print(traceback.format_exc(), E)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
0
AnyText/AnyText_scripts/ldm/data/__init__.py
Normal file
24
AnyText/AnyText_scripts/ldm/data/util.py
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
import torch
|
||||||
|
|
||||||
|
from ..modules.midas.api import load_midas_transform
|
||||||
|
|
||||||
|
|
||||||
|
class AddMiDaS(object):
|
||||||
|
def __init__(self, model_type):
|
||||||
|
super().__init__()
|
||||||
|
self.transform = load_midas_transform(model_type)
|
||||||
|
|
||||||
|
def pt2np(self, x):
|
||||||
|
x = ((x + 1.0) * .5).detach().cpu().numpy()
|
||||||
|
return x
|
||||||
|
|
||||||
|
def np2pt(self, x):
|
||||||
|
x = torch.from_numpy(x) * 2 - 1.
|
||||||
|
return x
|
||||||
|
|
||||||
|
def __call__(self, sample):
|
||||||
|
# sample['jpg'] is tensor hwc in [-1, 1] at this point
|
||||||
|
x = self.pt2np(sample['jpg'])
|
||||||
|
x = self.transform({"image": x})["image"]
|
||||||
|
sample['midas_in'] = x
|
||||||
|
return sample
|
219
AnyText/AnyText_scripts/ldm/models/autoencoder.py
Normal file
@ -0,0 +1,219 @@
|
|||||||
|
import torch
|
||||||
|
import pytorch_lightning as pl
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from contextlib import contextmanager
|
||||||
|
|
||||||
|
from ..modules.diffusionmodules.model import Encoder, Decoder
|
||||||
|
from ..modules.distributions.distributions import DiagonalGaussianDistribution
|
||||||
|
|
||||||
|
from ..util import instantiate_from_config
|
||||||
|
from ..modules.ema import LitEma
|
||||||
|
|
||||||
|
|
||||||
|
class AutoencoderKL(pl.LightningModule):
|
||||||
|
def __init__(self,
|
||||||
|
ddconfig,
|
||||||
|
lossconfig,
|
||||||
|
embed_dim,
|
||||||
|
ckpt_path=None,
|
||||||
|
ignore_keys=[],
|
||||||
|
image_key="image",
|
||||||
|
colorize_nlabels=None,
|
||||||
|
monitor=None,
|
||||||
|
ema_decay=None,
|
||||||
|
learn_logvar=False
|
||||||
|
):
|
||||||
|
super().__init__()
|
||||||
|
self.learn_logvar = learn_logvar
|
||||||
|
self.image_key = image_key
|
||||||
|
self.encoder = Encoder(**ddconfig)
|
||||||
|
self.decoder = Decoder(**ddconfig)
|
||||||
|
self.loss = instantiate_from_config(lossconfig)
|
||||||
|
assert ddconfig["double_z"]
|
||||||
|
self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1)
|
||||||
|
self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
|
||||||
|
self.embed_dim = embed_dim
|
||||||
|
if colorize_nlabels is not None:
|
||||||
|
assert type(colorize_nlabels)==int
|
||||||
|
self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
|
||||||
|
if monitor is not None:
|
||||||
|
self.monitor = monitor
|
||||||
|
|
||||||
|
self.use_ema = ema_decay is not None
|
||||||
|
if self.use_ema:
|
||||||
|
self.ema_decay = ema_decay
|
||||||
|
assert 0. < ema_decay < 1.
|
||||||
|
self.model_ema = LitEma(self, decay=ema_decay)
|
||||||
|
print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
|
||||||
|
|
||||||
|
if ckpt_path is not None:
|
||||||
|
self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
|
||||||
|
|
||||||
|
def init_from_ckpt(self, path, ignore_keys=list()):
|
||||||
|
sd = torch.load(path, map_location="cpu")["state_dict"]
|
||||||
|
keys = list(sd.keys())
|
||||||
|
for k in keys:
|
||||||
|
for ik in ignore_keys:
|
||||||
|
if k.startswith(ik):
|
||||||
|
print("Deleting key {} from state_dict.".format(k))
|
||||||
|
del sd[k]
|
||||||
|
self.load_state_dict(sd, strict=False)
|
||||||
|
print(f"Restored from {path}")
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def ema_scope(self, context=None):
|
||||||
|
if self.use_ema:
|
||||||
|
self.model_ema.store(self.parameters())
|
||||||
|
self.model_ema.copy_to(self)
|
||||||
|
if context is not None:
|
||||||
|
print(f"{context}: Switched to EMA weights")
|
||||||
|
try:
|
||||||
|
yield None
|
||||||
|
finally:
|
||||||
|
if self.use_ema:
|
||||||
|
self.model_ema.restore(self.parameters())
|
||||||
|
if context is not None:
|
||||||
|
print(f"{context}: Restored training weights")
|
||||||
|
|
||||||
|
def on_train_batch_end(self, *args, **kwargs):
|
||||||
|
if self.use_ema:
|
||||||
|
self.model_ema(self)
|
||||||
|
|
||||||
|
def encode(self, x):
|
||||||
|
h = self.encoder(x)
|
||||||
|
moments = self.quant_conv(h)
|
||||||
|
posterior = DiagonalGaussianDistribution(moments)
|
||||||
|
return posterior
|
||||||
|
|
||||||
|
def decode(self, z):
|
||||||
|
z = self.post_quant_conv(z)
|
||||||
|
dec = self.decoder(z)
|
||||||
|
return dec
|
||||||
|
|
||||||
|
def forward(self, input, sample_posterior=True):
|
||||||
|
posterior = self.encode(input)
|
||||||
|
if sample_posterior:
|
||||||
|
z = posterior.sample()
|
||||||
|
else:
|
||||||
|
z = posterior.mode()
|
||||||
|
dec = self.decode(z)
|
||||||
|
return dec, posterior
|
||||||
|
|
||||||
|
def get_input(self, batch, k):
|
||||||
|
x = batch[k]
|
||||||
|
if len(x.shape) == 3:
|
||||||
|
x = x[..., None]
|
||||||
|
x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
|
||||||
|
return x
|
||||||
|
|
||||||
|
def training_step(self, batch, batch_idx, optimizer_idx):
|
||||||
|
inputs = self.get_input(batch, self.image_key)
|
||||||
|
reconstructions, posterior = self(inputs)
|
||||||
|
|
||||||
|
if optimizer_idx == 0:
|
||||||
|
# train encoder+decoder+logvar
|
||||||
|
aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
|
||||||
|
last_layer=self.get_last_layer(), split="train")
|
||||||
|
self.log("aeloss", aeloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
|
||||||
|
self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=False)
|
||||||
|
return aeloss
|
||||||
|
|
||||||
|
if optimizer_idx == 1:
|
||||||
|
# train the discriminator
|
||||||
|
discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
|
||||||
|
last_layer=self.get_last_layer(), split="train")
|
||||||
|
|
||||||
|
self.log("discloss", discloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
|
||||||
|
self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=False)
|
||||||
|
return discloss
|
||||||
|
|
||||||
|
def validation_step(self, batch, batch_idx):
|
||||||
|
log_dict = self._validation_step(batch, batch_idx)
|
||||||
|
with self.ema_scope():
|
||||||
|
log_dict_ema = self._validation_step(batch, batch_idx, postfix="_ema")
|
||||||
|
return log_dict
|
||||||
|
|
||||||
|
def _validation_step(self, batch, batch_idx, postfix=""):
|
||||||
|
inputs = self.get_input(batch, self.image_key)
|
||||||
|
reconstructions, posterior = self(inputs)
|
||||||
|
aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, 0, self.global_step,
|
||||||
|
last_layer=self.get_last_layer(), split="val"+postfix)
|
||||||
|
|
||||||
|
discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, 1, self.global_step,
|
||||||
|
last_layer=self.get_last_layer(), split="val"+postfix)
|
||||||
|
|
||||||
|
self.log(f"val{postfix}/rec_loss", log_dict_ae[f"val{postfix}/rec_loss"])
|
||||||
|
self.log_dict(log_dict_ae)
|
||||||
|
self.log_dict(log_dict_disc)
|
||||||
|
return self.log_dict
|
||||||
|
|
||||||
|
def configure_optimizers(self):
|
||||||
|
lr = self.learning_rate
|
||||||
|
ae_params_list = list(self.encoder.parameters()) + list(self.decoder.parameters()) + list(
|
||||||
|
self.quant_conv.parameters()) + list(self.post_quant_conv.parameters())
|
||||||
|
if self.learn_logvar:
|
||||||
|
print(f"{self.__class__.__name__}: Learning logvar")
|
||||||
|
ae_params_list.append(self.loss.logvar)
|
||||||
|
opt_ae = torch.optim.Adam(ae_params_list,
|
||||||
|
lr=lr, betas=(0.5, 0.9))
|
||||||
|
opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
|
||||||
|
lr=lr, betas=(0.5, 0.9))
|
||||||
|
return [opt_ae, opt_disc], []
|
||||||
|
|
||||||
|
def get_last_layer(self):
|
||||||
|
return self.decoder.conv_out.weight
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def log_images(self, batch, only_inputs=False, log_ema=False, **kwargs):
|
||||||
|
log = dict()
|
||||||
|
x = self.get_input(batch, self.image_key)
|
||||||
|
x = x.to(self.device)
|
||||||
|
if not only_inputs:
|
||||||
|
xrec, posterior = self(x)
|
||||||
|
if x.shape[1] > 3:
|
||||||
|
# colorize with random projection
|
||||||
|
assert xrec.shape[1] > 3
|
||||||
|
x = self.to_rgb(x)
|
||||||
|
xrec = self.to_rgb(xrec)
|
||||||
|
log["samples"] = self.decode(torch.randn_like(posterior.sample()))
|
||||||
|
log["reconstructions"] = xrec
|
||||||
|
if log_ema or self.use_ema:
|
||||||
|
with self.ema_scope():
|
||||||
|
xrec_ema, posterior_ema = self(x)
|
||||||
|
if x.shape[1] > 3:
|
||||||
|
# colorize with random projection
|
||||||
|
assert xrec_ema.shape[1] > 3
|
||||||
|
xrec_ema = self.to_rgb(xrec_ema)
|
||||||
|
log["samples_ema"] = self.decode(torch.randn_like(posterior_ema.sample()))
|
||||||
|
log["reconstructions_ema"] = xrec_ema
|
||||||
|
log["inputs"] = x
|
||||||
|
return log
|
||||||
|
|
||||||
|
def to_rgb(self, x):
|
||||||
|
assert self.image_key == "segmentation"
|
||||||
|
if not hasattr(self, "colorize"):
|
||||||
|
self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
|
||||||
|
x = F.conv2d(x, weight=self.colorize)
|
||||||
|
x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class IdentityFirstStage(torch.nn.Module):
|
||||||
|
def __init__(self, *args, vq_interface=False, **kwargs):
|
||||||
|
self.vq_interface = vq_interface
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
def encode(self, x, *args, **kwargs):
|
||||||
|
return x
|
||||||
|
|
||||||
|
def decode(self, x, *args, **kwargs):
|
||||||
|
return x
|
||||||
|
|
||||||
|
def quantize(self, x, *args, **kwargs):
|
||||||
|
if self.vq_interface:
|
||||||
|
return x, None, [None, None, None]
|
||||||
|
return x
|
||||||
|
|
||||||
|
def forward(self, x, *args, **kwargs):
|
||||||
|
return x
|
||||||
|
|
354
AnyText/AnyText_scripts/ldm/models/diffusion/ddim.py
Normal file
@ -0,0 +1,354 @@
|
|||||||
|
"""SAMPLING ONLY."""
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from ...modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like, extract_into_tensor
|
||||||
|
|
||||||
|
|
||||||
|
class DDIMSampler(object):
|
||||||
|
def __init__(self, model, schedule="linear", **kwargs):
|
||||||
|
super().__init__()
|
||||||
|
self.model = model
|
||||||
|
self.ddpm_num_timesteps = model.num_timesteps
|
||||||
|
self.schedule = schedule
|
||||||
|
|
||||||
|
def register_buffer(self, name, attr):
|
||||||
|
if type(attr) == torch.Tensor:
|
||||||
|
if attr.device != torch.device("cuda"):
|
||||||
|
attr = attr.to(torch.device("cuda"))
|
||||||
|
setattr(self, name, attr)
|
||||||
|
|
||||||
|
def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
|
||||||
|
self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
|
||||||
|
num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose)
|
||||||
|
alphas_cumprod = self.model.alphas_cumprod
|
||||||
|
assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
|
||||||
|
to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
|
||||||
|
|
||||||
|
self.register_buffer('betas', to_torch(self.model.betas))
|
||||||
|
self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
|
||||||
|
self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))
|
||||||
|
|
||||||
|
# calculations for diffusion q(x_t | x_{t-1}) and others
|
||||||
|
self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
|
||||||
|
self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
|
||||||
|
self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
|
||||||
|
self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
|
||||||
|
self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))
|
||||||
|
|
||||||
|
# ddim sampling parameters
|
||||||
|
ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
|
||||||
|
ddim_timesteps=self.ddim_timesteps,
|
||||||
|
eta=ddim_eta,verbose=verbose)
|
||||||
|
self.register_buffer('ddim_sigmas', ddim_sigmas)
|
||||||
|
self.register_buffer('ddim_alphas', ddim_alphas)
|
||||||
|
self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
|
||||||
|
self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
|
||||||
|
sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
|
||||||
|
(1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
|
||||||
|
1 - self.alphas_cumprod / self.alphas_cumprod_prev))
|
||||||
|
self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def sample(self,
|
||||||
|
S,
|
||||||
|
batch_size,
|
||||||
|
shape,
|
||||||
|
conditioning=None,
|
||||||
|
callback=None,
|
||||||
|
normals_sequence=None,
|
||||||
|
img_callback=None,
|
||||||
|
quantize_x0=False,
|
||||||
|
eta=0.,
|
||||||
|
mask=None,
|
||||||
|
x0=None,
|
||||||
|
temperature=1.,
|
||||||
|
noise_dropout=0.,
|
||||||
|
score_corrector=None,
|
||||||
|
corrector_kwargs=None,
|
||||||
|
verbose=True,
|
||||||
|
x_T=None,
|
||||||
|
log_every_t=100,
|
||||||
|
unconditional_guidance_scale=1.,
|
||||||
|
unconditional_conditioning=None, # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
|
||||||
|
dynamic_threshold=None,
|
||||||
|
ucg_schedule=None,
|
||||||
|
**kwargs
|
||||||
|
):
|
||||||
|
if conditioning is not None:
|
||||||
|
if isinstance(conditioning, dict):
|
||||||
|
ctmp = conditioning[list(conditioning.keys())[0]]
|
||||||
|
while isinstance(ctmp, list): ctmp = ctmp[0]
|
||||||
|
cbs = ctmp.shape[0]
|
||||||
|
# cbs = len(ctmp[0])
|
||||||
|
if cbs != batch_size:
|
||||||
|
print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
|
||||||
|
|
||||||
|
elif isinstance(conditioning, list):
|
||||||
|
for ctmp in conditioning:
|
||||||
|
if ctmp.shape[0] != batch_size:
|
||||||
|
print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
|
||||||
|
|
||||||
|
else:
|
||||||
|
if conditioning.shape[0] != batch_size:
|
||||||
|
print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")
|
||||||
|
|
||||||
|
self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
|
||||||
|
# sampling
|
||||||
|
C, H, W = shape
|
||||||
|
size = (batch_size, C, H, W)
|
||||||
|
print(f'Data shape for DDIM sampling is {size}, eta {eta}')
|
||||||
|
|
||||||
|
samples, intermediates = self.ddim_sampling(conditioning, size,
|
||||||
|
callback=callback,
|
||||||
|
img_callback=img_callback,
|
||||||
|
quantize_denoised=quantize_x0,
|
||||||
|
mask=mask, x0=x0,
|
||||||
|
ddim_use_original_steps=False,
|
||||||
|
noise_dropout=noise_dropout,
|
||||||
|
temperature=temperature,
|
||||||
|
score_corrector=score_corrector,
|
||||||
|
corrector_kwargs=corrector_kwargs,
|
||||||
|
x_T=x_T,
|
||||||
|
log_every_t=log_every_t,
|
||||||
|
unconditional_guidance_scale=unconditional_guidance_scale,
|
||||||
|
unconditional_conditioning=unconditional_conditioning,
|
||||||
|
dynamic_threshold=dynamic_threshold,
|
||||||
|
ucg_schedule=ucg_schedule
|
||||||
|
)
|
||||||
|
return samples, intermediates
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def ddim_sampling(self, cond, shape,
|
||||||
|
x_T=None, ddim_use_original_steps=False,
|
||||||
|
callback=None, timesteps=None, quantize_denoised=False,
|
||||||
|
mask=None, x0=None, img_callback=None, log_every_t=100,
|
||||||
|
temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
|
||||||
|
unconditional_guidance_scale=1., unconditional_conditioning=None, dynamic_threshold=None,
|
||||||
|
ucg_schedule=None):
|
||||||
|
device = self.model.betas.device
|
||||||
|
b = shape[0]
|
||||||
|
if x_T is None:
|
||||||
|
img = torch.randn(shape, device=device)
|
||||||
|
else:
|
||||||
|
img = x_T
|
||||||
|
|
||||||
|
if timesteps is None:
|
||||||
|
timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
|
||||||
|
elif timesteps is not None and not ddim_use_original_steps:
|
||||||
|
subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
|
||||||
|
timesteps = self.ddim_timesteps[:subset_end]
|
||||||
|
|
||||||
|
intermediates = {'x_inter': [img], 'pred_x0': [img], "index": [10000]}
|
||||||
|
time_range = reversed(range(0, timesteps)) if ddim_use_original_steps else np.flip(timesteps)
|
||||||
|
total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
|
||||||
|
print(f"Running DDIM Sampling with {total_steps} timesteps")
|
||||||
|
|
||||||
|
iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps)
|
||||||
|
|
||||||
|
for i, step in enumerate(iterator):
|
||||||
|
index = total_steps - i - 1
|
||||||
|
ts = torch.full((b,), step, device=device, dtype=torch.long)
|
||||||
|
|
||||||
|
if mask is not None:
|
||||||
|
assert x0 is not None
|
||||||
|
img_orig = self.model.q_sample(x0, ts) # TODO: deterministic forward pass?
|
||||||
|
img = img_orig * mask + (1. - mask) * img
|
||||||
|
|
||||||
|
if ucg_schedule is not None:
|
||||||
|
assert len(ucg_schedule) == len(time_range)
|
||||||
|
unconditional_guidance_scale = ucg_schedule[i]
|
||||||
|
|
||||||
|
outs = self.p_sample_ddim(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
|
||||||
|
quantize_denoised=quantize_denoised, temperature=temperature,
|
||||||
|
noise_dropout=noise_dropout, score_corrector=score_corrector,
|
||||||
|
corrector_kwargs=corrector_kwargs,
|
||||||
|
unconditional_guidance_scale=unconditional_guidance_scale,
|
||||||
|
unconditional_conditioning=unconditional_conditioning,
|
||||||
|
dynamic_threshold=dynamic_threshold)
|
||||||
|
img, pred_x0 = outs
|
||||||
|
if callback:
|
||||||
|
callback(i)
|
||||||
|
if img_callback:
|
||||||
|
img_callback(pred_x0, i)
|
||||||
|
|
||||||
|
if index % log_every_t == 0 or index == total_steps - 1:
|
||||||
|
intermediates['x_inter'].append(img)
|
||||||
|
intermediates['pred_x0'].append(pred_x0)
|
||||||
|
intermediates['index'].append(index)
|
||||||
|
|
||||||
|
return img, intermediates
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
|
||||||
|
temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
|
||||||
|
unconditional_guidance_scale=1., unconditional_conditioning=None,
|
||||||
|
dynamic_threshold=None):
|
||||||
|
b, *_, device = *x.shape, x.device
|
||||||
|
|
||||||
|
if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
|
||||||
|
model_output = self.model.apply_model(x, t, c)
|
||||||
|
else:
|
||||||
|
x_in = torch.cat([x] * 2)
|
||||||
|
t_in = torch.cat([t] * 2)
|
||||||
|
if isinstance(c, dict):
|
||||||
|
assert isinstance(unconditional_conditioning, dict)
|
||||||
|
c_in = dict()
|
||||||
|
for k in c:
|
||||||
|
if isinstance(c[k], list):
|
||||||
|
c_in[k] = [torch.cat([
|
||||||
|
unconditional_conditioning[k][i],
|
||||||
|
c[k][i]]) for i in range(len(c[k]))]
|
||||||
|
elif isinstance(c[k], dict):
|
||||||
|
c_in[k] = dict()
|
||||||
|
for key in c[k]:
|
||||||
|
if isinstance(c[k][key], list):
|
||||||
|
if not isinstance(c[k][key][0], torch.Tensor):
|
||||||
|
continue
|
||||||
|
c_in[k][key] = [torch.cat([
|
||||||
|
unconditional_conditioning[k][key][i],
|
||||||
|
c[k][key][i]]) for i in range(len(c[k][key]))]
|
||||||
|
else:
|
||||||
|
c_in[k][key] = torch.cat([
|
||||||
|
unconditional_conditioning[k][key],
|
||||||
|
c[k][key]])
|
||||||
|
|
||||||
|
else:
|
||||||
|
c_in[k] = torch.cat([
|
||||||
|
unconditional_conditioning[k],
|
||||||
|
c[k]])
|
||||||
|
elif isinstance(c, list):
|
||||||
|
c_in = list()
|
||||||
|
assert isinstance(unconditional_conditioning, list)
|
||||||
|
for i in range(len(c)):
|
||||||
|
c_in.append(torch.cat([unconditional_conditioning[i], c[i]]))
|
||||||
|
else:
|
||||||
|
c_in = torch.cat([unconditional_conditioning, c])
|
||||||
|
model_uncond, model_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
|
||||||
|
model_output = model_uncond + unconditional_guidance_scale * (model_t - model_uncond)
|
||||||
|
|
||||||
|
if self.model.parameterization == "v":
|
||||||
|
e_t = self.model.predict_eps_from_z_and_v(x, t, model_output)
|
||||||
|
else:
|
||||||
|
e_t = model_output
|
||||||
|
|
||||||
|
if score_corrector is not None:
|
||||||
|
assert self.model.parameterization == "eps", 'not implemented'
|
||||||
|
e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)
|
||||||
|
|
||||||
|
alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
|
||||||
|
alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
|
||||||
|
sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
|
||||||
|
sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
|
||||||
|
# select parameters corresponding to the currently considered timestep
|
||||||
|
a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
|
||||||
|
a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
|
||||||
|
sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
|
||||||
|
sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)
|
||||||
|
|
||||||
|
# current prediction for x_0
|
||||||
|
if self.model.parameterization != "v":
|
||||||
|
pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
|
||||||
|
else:
|
||||||
|
pred_x0 = self.model.predict_start_from_z_and_v(x, t, model_output)
|
||||||
|
|
||||||
|
if quantize_denoised:
|
||||||
|
pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
|
||||||
|
|
||||||
|
if dynamic_threshold is not None:
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
# direction pointing to x_t
|
||||||
|
dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
|
||||||
|
noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
|
||||||
|
if noise_dropout > 0.:
|
||||||
|
noise = torch.nn.functional.dropout(noise, p=noise_dropout)
|
||||||
|
x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
|
||||||
|
return x_prev, pred_x0
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def encode(self, x0, c, t_enc, use_original_steps=False, return_intermediates=None,
|
||||||
|
unconditional_guidance_scale=1.0, unconditional_conditioning=None, callback=None):
|
||||||
|
num_reference_steps = self.ddpm_num_timesteps if use_original_steps else self.ddim_timesteps.shape[0]
|
||||||
|
|
||||||
|
assert t_enc <= num_reference_steps
|
||||||
|
num_steps = t_enc
|
||||||
|
|
||||||
|
if use_original_steps:
|
||||||
|
alphas_next = self.alphas_cumprod[:num_steps]
|
||||||
|
alphas = self.alphas_cumprod_prev[:num_steps]
|
||||||
|
else:
|
||||||
|
alphas_next = self.ddim_alphas[:num_steps]
|
||||||
|
alphas = torch.tensor(self.ddim_alphas_prev[:num_steps])
|
||||||
|
|
||||||
|
x_next = x0
|
||||||
|
intermediates = []
|
||||||
|
inter_steps = []
|
||||||
|
for i in tqdm(range(num_steps), desc='Encoding Image'):
|
||||||
|
t = torch.full((x0.shape[0],), i, device=self.model.device, dtype=torch.long)
|
||||||
|
if unconditional_guidance_scale == 1.:
|
||||||
|
noise_pred = self.model.apply_model(x_next, t, c)
|
||||||
|
else:
|
||||||
|
assert unconditional_conditioning is not None
|
||||||
|
e_t_uncond, noise_pred = torch.chunk(
|
||||||
|
self.model.apply_model(torch.cat((x_next, x_next)), torch.cat((t, t)),
|
||||||
|
torch.cat((unconditional_conditioning, c))), 2)
|
||||||
|
noise_pred = e_t_uncond + unconditional_guidance_scale * (noise_pred - e_t_uncond)
|
||||||
|
|
||||||
|
xt_weighted = (alphas_next[i] / alphas[i]).sqrt() * x_next
|
||||||
|
weighted_noise_pred = alphas_next[i].sqrt() * (
|
||||||
|
(1 / alphas_next[i] - 1).sqrt() - (1 / alphas[i] - 1).sqrt()) * noise_pred
|
||||||
|
x_next = xt_weighted + weighted_noise_pred
|
||||||
|
if return_intermediates and i % (
|
||||||
|
num_steps // return_intermediates) == 0 and i < num_steps - 1:
|
||||||
|
intermediates.append(x_next)
|
||||||
|
inter_steps.append(i)
|
||||||
|
elif return_intermediates and i >= num_steps - 2:
|
||||||
|
intermediates.append(x_next)
|
||||||
|
inter_steps.append(i)
|
||||||
|
if callback: callback(i)
|
||||||
|
|
||||||
|
out = {'x_encoded': x_next, 'intermediate_steps': inter_steps}
|
||||||
|
if return_intermediates:
|
||||||
|
out.update({'intermediates': intermediates})
|
||||||
|
return x_next, out
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def stochastic_encode(self, x0, t, use_original_steps=False, noise=None):
|
||||||
|
# fast, but does not allow for exact reconstruction
|
||||||
|
# t serves as an index to gather the correct alphas
|
||||||
|
if use_original_steps:
|
||||||
|
sqrt_alphas_cumprod = self.sqrt_alphas_cumprod
|
||||||
|
sqrt_one_minus_alphas_cumprod = self.sqrt_one_minus_alphas_cumprod
|
||||||
|
else:
|
||||||
|
sqrt_alphas_cumprod = torch.sqrt(self.ddim_alphas)
|
||||||
|
sqrt_one_minus_alphas_cumprod = self.ddim_sqrt_one_minus_alphas
|
||||||
|
|
||||||
|
if noise is None:
|
||||||
|
noise = torch.randn_like(x0)
|
||||||
|
return (extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape) * x0 +
|
||||||
|
extract_into_tensor(sqrt_one_minus_alphas_cumprod, t, x0.shape) * noise)
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def decode(self, x_latent, cond, t_start, unconditional_guidance_scale=1.0, unconditional_conditioning=None,
|
||||||
|
use_original_steps=False, callback=None):
|
||||||
|
|
||||||
|
timesteps = np.arange(self.ddpm_num_timesteps) if use_original_steps else self.ddim_timesteps
|
||||||
|
timesteps = timesteps[:t_start]
|
||||||
|
|
||||||
|
time_range = np.flip(timesteps)
|
||||||
|
total_steps = timesteps.shape[0]
|
||||||
|
print(f"Running DDIM Sampling with {total_steps} timesteps")
|
||||||
|
|
||||||
|
iterator = tqdm(time_range, desc='Decoding image', total=total_steps)
|
||||||
|
x_dec = x_latent
|
||||||
|
for i, step in enumerate(iterator):
|
||||||
|
index = total_steps - i - 1
|
||||||
|
ts = torch.full((x_latent.shape[0],), step, device=x_latent.device, dtype=torch.long)
|
||||||
|
x_dec, _ = self.p_sample_ddim(x_dec, cond, ts, index=index, use_original_steps=use_original_steps,
|
||||||
|
unconditional_guidance_scale=unconditional_guidance_scale,
|
||||||
|
unconditional_conditioning=unconditional_conditioning)
|
||||||
|
if callback: callback(i)
|
||||||
|
return x_dec
|
1963
AnyText/AnyText_scripts/ldm/models/diffusion/ddpm.py
Normal file
@ -0,0 +1 @@
|
|||||||
|
from .sampler import DPMSolverSampler
|
@ -0,0 +1,87 @@
|
|||||||
|
"""SAMPLING ONLY."""
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from .dpm_solver import NoiseScheduleVP, model_wrapper, DPM_Solver
|
||||||
|
|
||||||
|
|
||||||
|
MODEL_TYPES = {
|
||||||
|
"eps": "noise",
|
||||||
|
"v": "v"
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class DPMSolverSampler(object):
|
||||||
|
def __init__(self, model, **kwargs):
|
||||||
|
super().__init__()
|
||||||
|
self.model = model
|
||||||
|
to_torch = lambda x: x.clone().detach().to(torch.float32).to(model.device)
|
||||||
|
self.register_buffer('alphas_cumprod', to_torch(model.alphas_cumprod))
|
||||||
|
|
||||||
|
def register_buffer(self, name, attr):
|
||||||
|
if type(attr) == torch.Tensor:
|
||||||
|
if attr.device != torch.device("cuda"):
|
||||||
|
attr = attr.to(torch.device("cuda"))
|
||||||
|
setattr(self, name, attr)
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def sample(self,
|
||||||
|
S,
|
||||||
|
batch_size,
|
||||||
|
shape,
|
||||||
|
conditioning=None,
|
||||||
|
callback=None,
|
||||||
|
normals_sequence=None,
|
||||||
|
img_callback=None,
|
||||||
|
quantize_x0=False,
|
||||||
|
eta=0.,
|
||||||
|
mask=None,
|
||||||
|
x0=None,
|
||||||
|
temperature=1.,
|
||||||
|
noise_dropout=0.,
|
||||||
|
score_corrector=None,
|
||||||
|
corrector_kwargs=None,
|
||||||
|
verbose=True,
|
||||||
|
x_T=None,
|
||||||
|
log_every_t=100,
|
||||||
|
unconditional_guidance_scale=1.,
|
||||||
|
unconditional_conditioning=None,
|
||||||
|
# this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
|
||||||
|
**kwargs
|
||||||
|
):
|
||||||
|
if conditioning is not None:
|
||||||
|
if isinstance(conditioning, dict):
|
||||||
|
cbs = conditioning[list(conditioning.keys())[0]].shape[0]
|
||||||
|
if cbs != batch_size:
|
||||||
|
print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
|
||||||
|
else:
|
||||||
|
if conditioning.shape[0] != batch_size:
|
||||||
|
print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")
|
||||||
|
|
||||||
|
# sampling
|
||||||
|
C, H, W = shape
|
||||||
|
size = (batch_size, C, H, W)
|
||||||
|
|
||||||
|
print(f'Data shape for DPM-Solver sampling is {size}, sampling steps {S}')
|
||||||
|
|
||||||
|
device = self.model.betas.device
|
||||||
|
if x_T is None:
|
||||||
|
img = torch.randn(size, device=device)
|
||||||
|
else:
|
||||||
|
img = x_T
|
||||||
|
|
||||||
|
ns = NoiseScheduleVP('discrete', alphas_cumprod=self.alphas_cumprod)
|
||||||
|
|
||||||
|
model_fn = model_wrapper(
|
||||||
|
lambda x, t, c: self.model.apply_model(x, t, c),
|
||||||
|
ns,
|
||||||
|
model_type=MODEL_TYPES[self.model.parameterization],
|
||||||
|
guidance_type="classifier-free",
|
||||||
|
condition=conditioning,
|
||||||
|
unconditional_condition=unconditional_conditioning,
|
||||||
|
guidance_scale=unconditional_guidance_scale,
|
||||||
|
)
|
||||||
|
|
||||||
|
dpm_solver = DPM_Solver(model_fn, ns, predict_x0=True, thresholding=False)
|
||||||
|
x = dpm_solver.sample(img, steps=S, skip_type="time_uniform", method="multistep", order=2, lower_order_final=True)
|
||||||
|
|
||||||
|
return x.to(device), None
|
210
AnyText/AnyText_scripts/ldm/models/diffusion/ocr_recog/RNN.py
Normal file
@ -0,0 +1,210 @@
|
|||||||
|
from torch import nn
|
||||||
|
import torch
|
||||||
|
from .RecSVTR import Block
|
||||||
|
|
||||||
|
class Swish(nn.Module):
|
||||||
|
def __int__(self):
|
||||||
|
super(Swish, self).__int__()
|
||||||
|
|
||||||
|
def forward(self,x):
|
||||||
|
return x*torch.sigmoid(x)
|
||||||
|
|
||||||
|
class Im2Im(nn.Module):
|
||||||
|
def __init__(self, in_channels, **kwargs):
|
||||||
|
super().__init__()
|
||||||
|
self.out_channels = in_channels
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
return x
|
||||||
|
|
||||||
|
class Im2Seq(nn.Module):
|
||||||
|
def __init__(self, in_channels, **kwargs):
|
||||||
|
super().__init__()
|
||||||
|
self.out_channels = in_channels
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
B, C, H, W = x.shape
|
||||||
|
# assert H == 1
|
||||||
|
x = x.reshape(B, C, H * W)
|
||||||
|
x = x.permute((0, 2, 1))
|
||||||
|
return x
|
||||||
|
|
||||||
|
class EncoderWithRNN(nn.Module):
|
||||||
|
def __init__(self, in_channels,**kwargs):
|
||||||
|
super(EncoderWithRNN, self).__init__()
|
||||||
|
hidden_size = kwargs.get('hidden_size', 256)
|
||||||
|
self.out_channels = hidden_size * 2
|
||||||
|
self.lstm = nn.LSTM(in_channels, hidden_size, bidirectional=True, num_layers=2,batch_first=True)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
self.lstm.flatten_parameters()
|
||||||
|
x, _ = self.lstm(x)
|
||||||
|
return x
|
||||||
|
|
||||||
|
class SequenceEncoder(nn.Module):
|
||||||
|
def __init__(self, in_channels, encoder_type='rnn', **kwargs):
|
||||||
|
super(SequenceEncoder, self).__init__()
|
||||||
|
self.encoder_reshape = Im2Seq(in_channels)
|
||||||
|
self.out_channels = self.encoder_reshape.out_channels
|
||||||
|
self.encoder_type = encoder_type
|
||||||
|
if encoder_type == 'reshape':
|
||||||
|
self.only_reshape = True
|
||||||
|
else:
|
||||||
|
support_encoder_dict = {
|
||||||
|
'reshape': Im2Seq,
|
||||||
|
'rnn': EncoderWithRNN,
|
||||||
|
'svtr': EncoderWithSVTR
|
||||||
|
}
|
||||||
|
assert encoder_type in support_encoder_dict, '{} must in {}'.format(
|
||||||
|
encoder_type, support_encoder_dict.keys())
|
||||||
|
|
||||||
|
self.encoder = support_encoder_dict[encoder_type](
|
||||||
|
self.encoder_reshape.out_channels,**kwargs)
|
||||||
|
self.out_channels = self.encoder.out_channels
|
||||||
|
self.only_reshape = False
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
if self.encoder_type != 'svtr':
|
||||||
|
x = self.encoder_reshape(x)
|
||||||
|
if not self.only_reshape:
|
||||||
|
x = self.encoder(x)
|
||||||
|
return x
|
||||||
|
else:
|
||||||
|
x = self.encoder(x)
|
||||||
|
x = self.encoder_reshape(x)
|
||||||
|
return x
|
||||||
|
|
||||||
|
class ConvBNLayer(nn.Module):
|
||||||
|
def __init__(self,
|
||||||
|
in_channels,
|
||||||
|
out_channels,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=1,
|
||||||
|
padding=0,
|
||||||
|
bias_attr=False,
|
||||||
|
groups=1,
|
||||||
|
act=nn.GELU):
|
||||||
|
super().__init__()
|
||||||
|
self.conv = nn.Conv2d(
|
||||||
|
in_channels=in_channels,
|
||||||
|
out_channels=out_channels,
|
||||||
|
kernel_size=kernel_size,
|
||||||
|
stride=stride,
|
||||||
|
padding=padding,
|
||||||
|
groups=groups,
|
||||||
|
# weight_attr=paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()),
|
||||||
|
bias=bias_attr)
|
||||||
|
self.norm = nn.BatchNorm2d(out_channels)
|
||||||
|
self.act = Swish()
|
||||||
|
|
||||||
|
def forward(self, inputs):
|
||||||
|
out = self.conv(inputs)
|
||||||
|
out = self.norm(out)
|
||||||
|
out = self.act(out)
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
class EncoderWithSVTR(nn.Module):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
in_channels,
|
||||||
|
dims=64, # XS
|
||||||
|
depth=2,
|
||||||
|
hidden_dims=120,
|
||||||
|
use_guide=False,
|
||||||
|
num_heads=8,
|
||||||
|
qkv_bias=True,
|
||||||
|
mlp_ratio=2.0,
|
||||||
|
drop_rate=0.1,
|
||||||
|
attn_drop_rate=0.1,
|
||||||
|
drop_path=0.,
|
||||||
|
qk_scale=None):
|
||||||
|
super(EncoderWithSVTR, self).__init__()
|
||||||
|
self.depth = depth
|
||||||
|
self.use_guide = use_guide
|
||||||
|
self.conv1 = ConvBNLayer(
|
||||||
|
in_channels, in_channels // 8, padding=1, act='swish')
|
||||||
|
self.conv2 = ConvBNLayer(
|
||||||
|
in_channels // 8, hidden_dims, kernel_size=1, act='swish')
|
||||||
|
|
||||||
|
self.svtr_block = nn.ModuleList([
|
||||||
|
Block(
|
||||||
|
dim=hidden_dims,
|
||||||
|
num_heads=num_heads,
|
||||||
|
mixer='Global',
|
||||||
|
HW=None,
|
||||||
|
mlp_ratio=mlp_ratio,
|
||||||
|
qkv_bias=qkv_bias,
|
||||||
|
qk_scale=qk_scale,
|
||||||
|
drop=drop_rate,
|
||||||
|
act_layer='swish',
|
||||||
|
attn_drop=attn_drop_rate,
|
||||||
|
drop_path=drop_path,
|
||||||
|
norm_layer='nn.LayerNorm',
|
||||||
|
epsilon=1e-05,
|
||||||
|
prenorm=False) for i in range(depth)
|
||||||
|
])
|
||||||
|
self.norm = nn.LayerNorm(hidden_dims, eps=1e-6)
|
||||||
|
self.conv3 = ConvBNLayer(
|
||||||
|
hidden_dims, in_channels, kernel_size=1, act='swish')
|
||||||
|
# last conv-nxn, the input is concat of input tensor and conv3 output tensor
|
||||||
|
self.conv4 = ConvBNLayer(
|
||||||
|
2 * in_channels, in_channels // 8, padding=1, act='swish')
|
||||||
|
|
||||||
|
self.conv1x1 = ConvBNLayer(
|
||||||
|
in_channels // 8, dims, kernel_size=1, act='swish')
|
||||||
|
self.out_channels = dims
|
||||||
|
self.apply(self._init_weights)
|
||||||
|
|
||||||
|
def _init_weights(self, m):
|
||||||
|
# weight initialization
|
||||||
|
if isinstance(m, nn.Conv2d):
|
||||||
|
nn.init.kaiming_normal_(m.weight, mode='fan_out')
|
||||||
|
if m.bias is not None:
|
||||||
|
nn.init.zeros_(m.bias)
|
||||||
|
elif isinstance(m, nn.BatchNorm2d):
|
||||||
|
nn.init.ones_(m.weight)
|
||||||
|
nn.init.zeros_(m.bias)
|
||||||
|
elif isinstance(m, nn.Linear):
|
||||||
|
nn.init.normal_(m.weight, 0, 0.01)
|
||||||
|
if m.bias is not None:
|
||||||
|
nn.init.zeros_(m.bias)
|
||||||
|
elif isinstance(m, nn.ConvTranspose2d):
|
||||||
|
nn.init.kaiming_normal_(m.weight, mode='fan_out')
|
||||||
|
if m.bias is not None:
|
||||||
|
nn.init.zeros_(m.bias)
|
||||||
|
elif isinstance(m, nn.LayerNorm):
|
||||||
|
nn.init.ones_(m.weight)
|
||||||
|
nn.init.zeros_(m.bias)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
# for use guide
|
||||||
|
if self.use_guide:
|
||||||
|
z = x.clone()
|
||||||
|
z.stop_gradient = True
|
||||||
|
else:
|
||||||
|
z = x
|
||||||
|
# for short cut
|
||||||
|
h = z
|
||||||
|
# reduce dim
|
||||||
|
z = self.conv1(z)
|
||||||
|
z = self.conv2(z)
|
||||||
|
# SVTR global block
|
||||||
|
B, C, H, W = z.shape
|
||||||
|
z = z.flatten(2).permute(0, 2, 1)
|
||||||
|
|
||||||
|
for blk in self.svtr_block:
|
||||||
|
z = blk(z)
|
||||||
|
|
||||||
|
z = self.norm(z)
|
||||||
|
# last stage
|
||||||
|
z = z.reshape([-1, H, W, C]).permute(0, 3, 1, 2)
|
||||||
|
z = self.conv3(z)
|
||||||
|
z = torch.cat((h, z), dim=1)
|
||||||
|
z = self.conv1x1(self.conv4(z))
|
||||||
|
|
||||||
|
return z
|
||||||
|
|
||||||
|
if __name__=="__main__":
|
||||||
|
svtrRNN = EncoderWithSVTR(56)
|
||||||
|
print(svtrRNN)
|
@ -0,0 +1,48 @@
|
|||||||
|
from torch import nn
|
||||||
|
|
||||||
|
|
||||||
|
class CTCHead(nn.Module):
|
||||||
|
def __init__(self,
|
||||||
|
in_channels,
|
||||||
|
out_channels=6625,
|
||||||
|
fc_decay=0.0004,
|
||||||
|
mid_channels=None,
|
||||||
|
return_feats=False,
|
||||||
|
**kwargs):
|
||||||
|
super(CTCHead, self).__init__()
|
||||||
|
if mid_channels is None:
|
||||||
|
self.fc = nn.Linear(
|
||||||
|
in_channels,
|
||||||
|
out_channels,
|
||||||
|
bias=True,)
|
||||||
|
else:
|
||||||
|
self.fc1 = nn.Linear(
|
||||||
|
in_channels,
|
||||||
|
mid_channels,
|
||||||
|
bias=True,
|
||||||
|
)
|
||||||
|
self.fc2 = nn.Linear(
|
||||||
|
mid_channels,
|
||||||
|
out_channels,
|
||||||
|
bias=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.out_channels = out_channels
|
||||||
|
self.mid_channels = mid_channels
|
||||||
|
self.return_feats = return_feats
|
||||||
|
|
||||||
|
def forward(self, x, labels=None):
|
||||||
|
if self.mid_channels is None:
|
||||||
|
predicts = self.fc(x)
|
||||||
|
else:
|
||||||
|
x = self.fc1(x)
|
||||||
|
predicts = self.fc2(x)
|
||||||
|
|
||||||
|
if self.return_feats:
|
||||||
|
result = dict()
|
||||||
|
result['ctc'] = predicts
|
||||||
|
result['ctc_neck'] = x
|
||||||
|
else:
|
||||||
|
result = predicts
|
||||||
|
|
||||||
|
return result
|
@ -0,0 +1,45 @@
|
|||||||
|
from torch import nn
|
||||||
|
from .RNN import SequenceEncoder, Im2Seq, Im2Im
|
||||||
|
from .RecMv1_enhance import MobileNetV1Enhance
|
||||||
|
|
||||||
|
from .RecCTCHead import CTCHead
|
||||||
|
|
||||||
|
backbone_dict = {"MobileNetV1Enhance":MobileNetV1Enhance}
|
||||||
|
neck_dict = {'SequenceEncoder': SequenceEncoder, 'Im2Seq': Im2Seq,'None':Im2Im}
|
||||||
|
head_dict = {'CTCHead':CTCHead}
|
||||||
|
|
||||||
|
|
||||||
|
class RecModel(nn.Module):
|
||||||
|
def __init__(self, config):
|
||||||
|
super().__init__()
|
||||||
|
assert 'in_channels' in config, 'in_channels must in model config'
|
||||||
|
backbone_type = config.backbone.pop('type')
|
||||||
|
assert backbone_type in backbone_dict, f'backbone.type must in {backbone_dict}'
|
||||||
|
self.backbone = backbone_dict[backbone_type](config.in_channels, **config.backbone)
|
||||||
|
|
||||||
|
neck_type = config.neck.pop('type')
|
||||||
|
assert neck_type in neck_dict, f'neck.type must in {neck_dict}'
|
||||||
|
self.neck = neck_dict[neck_type](self.backbone.out_channels, **config.neck)
|
||||||
|
|
||||||
|
head_type = config.head.pop('type')
|
||||||
|
assert head_type in head_dict, f'head.type must in {head_dict}'
|
||||||
|
self.head = head_dict[head_type](self.neck.out_channels, **config.head)
|
||||||
|
|
||||||
|
self.name = f'RecModel_{backbone_type}_{neck_type}_{head_type}'
|
||||||
|
|
||||||
|
def load_3rd_state_dict(self, _3rd_name, _state):
|
||||||
|
self.backbone.load_3rd_state_dict(_3rd_name, _state)
|
||||||
|
self.neck.load_3rd_state_dict(_3rd_name, _state)
|
||||||
|
self.head.load_3rd_state_dict(_3rd_name, _state)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
x = self.backbone(x)
|
||||||
|
x = self.neck(x)
|
||||||
|
x = self.head(x)
|
||||||
|
return x
|
||||||
|
|
||||||
|
def encode(self, x):
|
||||||
|
x = self.backbone(x)
|
||||||
|
x = self.neck(x)
|
||||||
|
x = self.head.ctc_encoder(x)
|
||||||
|
return x
|
@ -0,0 +1,233 @@
|
|||||||
|
import os, sys
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from .common import Activation
|
||||||
|
|
||||||
|
|
||||||
|
class ConvBNLayer(nn.Module):
|
||||||
|
def __init__(self,
|
||||||
|
num_channels,
|
||||||
|
filter_size,
|
||||||
|
num_filters,
|
||||||
|
stride,
|
||||||
|
padding,
|
||||||
|
channels=None,
|
||||||
|
num_groups=1,
|
||||||
|
act='hard_swish'):
|
||||||
|
super(ConvBNLayer, self).__init__()
|
||||||
|
self.act = act
|
||||||
|
self._conv = nn.Conv2d(
|
||||||
|
in_channels=num_channels,
|
||||||
|
out_channels=num_filters,
|
||||||
|
kernel_size=filter_size,
|
||||||
|
stride=stride,
|
||||||
|
padding=padding,
|
||||||
|
groups=num_groups,
|
||||||
|
bias=False)
|
||||||
|
|
||||||
|
self._batch_norm = nn.BatchNorm2d(
|
||||||
|
num_filters,
|
||||||
|
)
|
||||||
|
if self.act is not None:
|
||||||
|
self._act = Activation(act_type=act, inplace=True)
|
||||||
|
|
||||||
|
def forward(self, inputs):
|
||||||
|
y = self._conv(inputs)
|
||||||
|
y = self._batch_norm(y)
|
||||||
|
if self.act is not None:
|
||||||
|
y = self._act(y)
|
||||||
|
return y
|
||||||
|
|
||||||
|
|
||||||
|
class DepthwiseSeparable(nn.Module):
|
||||||
|
def __init__(self,
|
||||||
|
num_channels,
|
||||||
|
num_filters1,
|
||||||
|
num_filters2,
|
||||||
|
num_groups,
|
||||||
|
stride,
|
||||||
|
scale,
|
||||||
|
dw_size=3,
|
||||||
|
padding=1,
|
||||||
|
use_se=False):
|
||||||
|
super(DepthwiseSeparable, self).__init__()
|
||||||
|
self.use_se = use_se
|
||||||
|
self._depthwise_conv = ConvBNLayer(
|
||||||
|
num_channels=num_channels,
|
||||||
|
num_filters=int(num_filters1 * scale),
|
||||||
|
filter_size=dw_size,
|
||||||
|
stride=stride,
|
||||||
|
padding=padding,
|
||||||
|
num_groups=int(num_groups * scale))
|
||||||
|
if use_se:
|
||||||
|
self._se = SEModule(int(num_filters1 * scale))
|
||||||
|
self._pointwise_conv = ConvBNLayer(
|
||||||
|
num_channels=int(num_filters1 * scale),
|
||||||
|
filter_size=1,
|
||||||
|
num_filters=int(num_filters2 * scale),
|
||||||
|
stride=1,
|
||||||
|
padding=0)
|
||||||
|
|
||||||
|
def forward(self, inputs):
|
||||||
|
y = self._depthwise_conv(inputs)
|
||||||
|
if self.use_se:
|
||||||
|
y = self._se(y)
|
||||||
|
y = self._pointwise_conv(y)
|
||||||
|
return y
|
||||||
|
|
||||||
|
|
||||||
|
class MobileNetV1Enhance(nn.Module):
|
||||||
|
def __init__(self,
|
||||||
|
in_channels=3,
|
||||||
|
scale=0.5,
|
||||||
|
last_conv_stride=1,
|
||||||
|
last_pool_type='max',
|
||||||
|
**kwargs):
|
||||||
|
super().__init__()
|
||||||
|
self.scale = scale
|
||||||
|
self.block_list = []
|
||||||
|
|
||||||
|
self.conv1 = ConvBNLayer(
|
||||||
|
num_channels=in_channels,
|
||||||
|
filter_size=3,
|
||||||
|
channels=3,
|
||||||
|
num_filters=int(32 * scale),
|
||||||
|
stride=2,
|
||||||
|
padding=1)
|
||||||
|
|
||||||
|
conv2_1 = DepthwiseSeparable(
|
||||||
|
num_channels=int(32 * scale),
|
||||||
|
num_filters1=32,
|
||||||
|
num_filters2=64,
|
||||||
|
num_groups=32,
|
||||||
|
stride=1,
|
||||||
|
scale=scale)
|
||||||
|
self.block_list.append(conv2_1)
|
||||||
|
|
||||||
|
conv2_2 = DepthwiseSeparable(
|
||||||
|
num_channels=int(64 * scale),
|
||||||
|
num_filters1=64,
|
||||||
|
num_filters2=128,
|
||||||
|
num_groups=64,
|
||||||
|
stride=1,
|
||||||
|
scale=scale)
|
||||||
|
self.block_list.append(conv2_2)
|
||||||
|
|
||||||
|
conv3_1 = DepthwiseSeparable(
|
||||||
|
num_channels=int(128 * scale),
|
||||||
|
num_filters1=128,
|
||||||
|
num_filters2=128,
|
||||||
|
num_groups=128,
|
||||||
|
stride=1,
|
||||||
|
scale=scale)
|
||||||
|
self.block_list.append(conv3_1)
|
||||||
|
|
||||||
|
conv3_2 = DepthwiseSeparable(
|
||||||
|
num_channels=int(128 * scale),
|
||||||
|
num_filters1=128,
|
||||||
|
num_filters2=256,
|
||||||
|
num_groups=128,
|
||||||
|
stride=(2, 1),
|
||||||
|
scale=scale)
|
||||||
|
self.block_list.append(conv3_2)
|
||||||
|
|
||||||
|
conv4_1 = DepthwiseSeparable(
|
||||||
|
num_channels=int(256 * scale),
|
||||||
|
num_filters1=256,
|
||||||
|
num_filters2=256,
|
||||||
|
num_groups=256,
|
||||||
|
stride=1,
|
||||||
|
scale=scale)
|
||||||
|
self.block_list.append(conv4_1)
|
||||||
|
|
||||||
|
conv4_2 = DepthwiseSeparable(
|
||||||
|
num_channels=int(256 * scale),
|
||||||
|
num_filters1=256,
|
||||||
|
num_filters2=512,
|
||||||
|
num_groups=256,
|
||||||
|
stride=(2, 1),
|
||||||
|
scale=scale)
|
||||||
|
self.block_list.append(conv4_2)
|
||||||
|
|
||||||
|
for _ in range(5):
|
||||||
|
conv5 = DepthwiseSeparable(
|
||||||
|
num_channels=int(512 * scale),
|
||||||
|
num_filters1=512,
|
||||||
|
num_filters2=512,
|
||||||
|
num_groups=512,
|
||||||
|
stride=1,
|
||||||
|
dw_size=5,
|
||||||
|
padding=2,
|
||||||
|
scale=scale,
|
||||||
|
use_se=False)
|
||||||
|
self.block_list.append(conv5)
|
||||||
|
|
||||||
|
conv5_6 = DepthwiseSeparable(
|
||||||
|
num_channels=int(512 * scale),
|
||||||
|
num_filters1=512,
|
||||||
|
num_filters2=1024,
|
||||||
|
num_groups=512,
|
||||||
|
stride=(2, 1),
|
||||||
|
dw_size=5,
|
||||||
|
padding=2,
|
||||||
|
scale=scale,
|
||||||
|
use_se=True)
|
||||||
|
self.block_list.append(conv5_6)
|
||||||
|
|
||||||
|
conv6 = DepthwiseSeparable(
|
||||||
|
num_channels=int(1024 * scale),
|
||||||
|
num_filters1=1024,
|
||||||
|
num_filters2=1024,
|
||||||
|
num_groups=1024,
|
||||||
|
stride=last_conv_stride,
|
||||||
|
dw_size=5,
|
||||||
|
padding=2,
|
||||||
|
use_se=True,
|
||||||
|
scale=scale)
|
||||||
|
self.block_list.append(conv6)
|
||||||
|
|
||||||
|
self.block_list = nn.Sequential(*self.block_list)
|
||||||
|
if last_pool_type == 'avg':
|
||||||
|
self.pool = nn.AvgPool2d(kernel_size=2, stride=2, padding=0)
|
||||||
|
else:
|
||||||
|
self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
|
||||||
|
self.out_channels = int(1024 * scale)
|
||||||
|
|
||||||
|
def forward(self, inputs):
|
||||||
|
y = self.conv1(inputs)
|
||||||
|
y = self.block_list(y)
|
||||||
|
y = self.pool(y)
|
||||||
|
return y
|
||||||
|
|
||||||
|
def hardsigmoid(x):
|
||||||
|
return F.relu6(x + 3., inplace=True) / 6.
|
||||||
|
|
||||||
|
class SEModule(nn.Module):
|
||||||
|
def __init__(self, channel, reduction=4):
|
||||||
|
super(SEModule, self).__init__()
|
||||||
|
self.avg_pool = nn.AdaptiveAvgPool2d(1)
|
||||||
|
self.conv1 = nn.Conv2d(
|
||||||
|
in_channels=channel,
|
||||||
|
out_channels=channel // reduction,
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0,
|
||||||
|
bias=True)
|
||||||
|
self.conv2 = nn.Conv2d(
|
||||||
|
in_channels=channel // reduction,
|
||||||
|
out_channels=channel,
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0,
|
||||||
|
bias=True)
|
||||||
|
|
||||||
|
def forward(self, inputs):
|
||||||
|
outputs = self.avg_pool(inputs)
|
||||||
|
outputs = self.conv1(outputs)
|
||||||
|
outputs = F.relu(outputs)
|
||||||
|
outputs = self.conv2(outputs)
|
||||||
|
outputs = hardsigmoid(outputs)
|
||||||
|
x = torch.mul(inputs, outputs)
|
||||||
|
|
||||||
|
return x
|
@ -0,0 +1,591 @@
|
|||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import numpy as np
|
||||||
|
from torch.nn.init import trunc_normal_, zeros_, ones_
|
||||||
|
from torch.nn import functional
|
||||||
|
|
||||||
|
|
||||||
|
def drop_path(x, drop_prob=0., training=False):
|
||||||
|
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
||||||
|
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
|
||||||
|
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...
|
||||||
|
"""
|
||||||
|
if drop_prob == 0. or not training:
|
||||||
|
return x
|
||||||
|
keep_prob = torch.tensor(1 - drop_prob)
|
||||||
|
shape = (x.size()[0], ) + (1, ) * (x.ndim - 1)
|
||||||
|
random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype)
|
||||||
|
random_tensor = torch.floor(random_tensor) # binarize
|
||||||
|
output = x.divide(keep_prob) * random_tensor
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
class Swish(nn.Module):
|
||||||
|
def __int__(self):
|
||||||
|
super(Swish, self).__int__()
|
||||||
|
|
||||||
|
def forward(self,x):
|
||||||
|
return x*torch.sigmoid(x)
|
||||||
|
|
||||||
|
|
||||||
|
class ConvBNLayer(nn.Module):
|
||||||
|
def __init__(self,
|
||||||
|
in_channels,
|
||||||
|
out_channels,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=1,
|
||||||
|
padding=0,
|
||||||
|
bias_attr=False,
|
||||||
|
groups=1,
|
||||||
|
act=nn.GELU):
|
||||||
|
super().__init__()
|
||||||
|
self.conv = nn.Conv2d(
|
||||||
|
in_channels=in_channels,
|
||||||
|
out_channels=out_channels,
|
||||||
|
kernel_size=kernel_size,
|
||||||
|
stride=stride,
|
||||||
|
padding=padding,
|
||||||
|
groups=groups,
|
||||||
|
# weight_attr=paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()),
|
||||||
|
bias=bias_attr)
|
||||||
|
self.norm = nn.BatchNorm2d(out_channels)
|
||||||
|
self.act = act()
|
||||||
|
|
||||||
|
def forward(self, inputs):
|
||||||
|
out = self.conv(inputs)
|
||||||
|
out = self.norm(out)
|
||||||
|
out = self.act(out)
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
class DropPath(nn.Module):
|
||||||
|
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, drop_prob=None):
|
||||||
|
super(DropPath, self).__init__()
|
||||||
|
self.drop_prob = drop_prob
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
return drop_path(x, self.drop_prob, self.training)
|
||||||
|
|
||||||
|
|
||||||
|
class Identity(nn.Module):
|
||||||
|
def __init__(self):
|
||||||
|
super(Identity, self).__init__()
|
||||||
|
|
||||||
|
def forward(self, input):
|
||||||
|
return input
|
||||||
|
|
||||||
|
|
||||||
|
class Mlp(nn.Module):
|
||||||
|
def __init__(self,
|
||||||
|
in_features,
|
||||||
|
hidden_features=None,
|
||||||
|
out_features=None,
|
||||||
|
act_layer=nn.GELU,
|
||||||
|
drop=0.):
|
||||||
|
super().__init__()
|
||||||
|
out_features = out_features or in_features
|
||||||
|
hidden_features = hidden_features or in_features
|
||||||
|
self.fc1 = nn.Linear(in_features, hidden_features)
|
||||||
|
if isinstance(act_layer, str):
|
||||||
|
self.act = Swish()
|
||||||
|
else:
|
||||||
|
self.act = act_layer()
|
||||||
|
self.fc2 = nn.Linear(hidden_features, out_features)
|
||||||
|
self.drop = nn.Dropout(drop)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
x = self.fc1(x)
|
||||||
|
x = self.act(x)
|
||||||
|
x = self.drop(x)
|
||||||
|
x = self.fc2(x)
|
||||||
|
x = self.drop(x)
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class ConvMixer(nn.Module):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
dim,
|
||||||
|
num_heads=8,
|
||||||
|
HW=(8, 25),
|
||||||
|
local_k=(3, 3), ):
|
||||||
|
super().__init__()
|
||||||
|
self.HW = HW
|
||||||
|
self.dim = dim
|
||||||
|
self.local_mixer = nn.Conv2d(
|
||||||
|
dim,
|
||||||
|
dim,
|
||||||
|
local_k,
|
||||||
|
1, (local_k[0] // 2, local_k[1] // 2),
|
||||||
|
groups=num_heads,
|
||||||
|
# weight_attr=ParamAttr(initializer=KaimingNormal())
|
||||||
|
)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
h = self.HW[0]
|
||||||
|
w = self.HW[1]
|
||||||
|
x = x.transpose([0, 2, 1]).reshape([0, self.dim, h, w])
|
||||||
|
x = self.local_mixer(x)
|
||||||
|
x = x.flatten(2).transpose([0, 2, 1])
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class Attention(nn.Module):
|
||||||
|
def __init__(self,
|
||||||
|
dim,
|
||||||
|
num_heads=8,
|
||||||
|
mixer='Global',
|
||||||
|
HW=(8, 25),
|
||||||
|
local_k=(7, 11),
|
||||||
|
qkv_bias=False,
|
||||||
|
qk_scale=None,
|
||||||
|
attn_drop=0.,
|
||||||
|
proj_drop=0.):
|
||||||
|
super().__init__()
|
||||||
|
self.num_heads = num_heads
|
||||||
|
head_dim = dim // num_heads
|
||||||
|
self.scale = qk_scale or head_dim**-0.5
|
||||||
|
|
||||||
|
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
||||||
|
self.attn_drop = nn.Dropout(attn_drop)
|
||||||
|
self.proj = nn.Linear(dim, dim)
|
||||||
|
self.proj_drop = nn.Dropout(proj_drop)
|
||||||
|
self.HW = HW
|
||||||
|
if HW is not None:
|
||||||
|
H = HW[0]
|
||||||
|
W = HW[1]
|
||||||
|
self.N = H * W
|
||||||
|
self.C = dim
|
||||||
|
if mixer == 'Local' and HW is not None:
|
||||||
|
hk = local_k[0]
|
||||||
|
wk = local_k[1]
|
||||||
|
mask = torch.ones([H * W, H + hk - 1, W + wk - 1])
|
||||||
|
for h in range(0, H):
|
||||||
|
for w in range(0, W):
|
||||||
|
mask[h * W + w, h:h + hk, w:w + wk] = 0.
|
||||||
|
mask_paddle = mask[:, hk // 2:H + hk // 2, wk // 2:W + wk //
|
||||||
|
2].flatten(1)
|
||||||
|
mask_inf = torch.full([H * W, H * W],fill_value=float('-inf'))
|
||||||
|
mask = torch.where(mask_paddle < 1, mask_paddle, mask_inf)
|
||||||
|
self.mask = mask[None,None,:]
|
||||||
|
# self.mask = mask.unsqueeze([0, 1])
|
||||||
|
self.mixer = mixer
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
if self.HW is not None:
|
||||||
|
N = self.N
|
||||||
|
C = self.C
|
||||||
|
else:
|
||||||
|
_, N, C = x.shape
|
||||||
|
qkv = self.qkv(x).reshape((-1, N, 3, self.num_heads, C //self.num_heads)).permute((2, 0, 3, 1, 4))
|
||||||
|
q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
|
||||||
|
|
||||||
|
attn = (q.matmul(k.permute((0, 1, 3, 2))))
|
||||||
|
if self.mixer == 'Local':
|
||||||
|
attn += self.mask
|
||||||
|
attn = functional.softmax(attn, dim=-1)
|
||||||
|
attn = self.attn_drop(attn)
|
||||||
|
|
||||||
|
x = (attn.matmul(v)).permute((0, 2, 1, 3)).reshape((-1, N, C))
|
||||||
|
x = self.proj(x)
|
||||||
|
x = self.proj_drop(x)
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class Block(nn.Module):
|
||||||
|
def __init__(self,
|
||||||
|
dim,
|
||||||
|
num_heads,
|
||||||
|
mixer='Global',
|
||||||
|
local_mixer=(7, 11),
|
||||||
|
HW=(8, 25),
|
||||||
|
mlp_ratio=4.,
|
||||||
|
qkv_bias=False,
|
||||||
|
qk_scale=None,
|
||||||
|
drop=0.,
|
||||||
|
attn_drop=0.,
|
||||||
|
drop_path=0.,
|
||||||
|
act_layer=nn.GELU,
|
||||||
|
norm_layer='nn.LayerNorm',
|
||||||
|
epsilon=1e-6,
|
||||||
|
prenorm=True):
|
||||||
|
super().__init__()
|
||||||
|
if isinstance(norm_layer, str):
|
||||||
|
self.norm1 = eval(norm_layer)(dim, eps=epsilon)
|
||||||
|
else:
|
||||||
|
self.norm1 = norm_layer(dim)
|
||||||
|
if mixer == 'Global' or mixer == 'Local':
|
||||||
|
|
||||||
|
self.mixer = Attention(
|
||||||
|
dim,
|
||||||
|
num_heads=num_heads,
|
||||||
|
mixer=mixer,
|
||||||
|
HW=HW,
|
||||||
|
local_k=local_mixer,
|
||||||
|
qkv_bias=qkv_bias,
|
||||||
|
qk_scale=qk_scale,
|
||||||
|
attn_drop=attn_drop,
|
||||||
|
proj_drop=drop)
|
||||||
|
elif mixer == 'Conv':
|
||||||
|
self.mixer = ConvMixer(
|
||||||
|
dim, num_heads=num_heads, HW=HW, local_k=local_mixer)
|
||||||
|
else:
|
||||||
|
raise TypeError("The mixer must be one of [Global, Local, Conv]")
|
||||||
|
|
||||||
|
self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
|
||||||
|
if isinstance(norm_layer, str):
|
||||||
|
self.norm2 = eval(norm_layer)(dim, eps=epsilon)
|
||||||
|
else:
|
||||||
|
self.norm2 = norm_layer(dim)
|
||||||
|
mlp_hidden_dim = int(dim * mlp_ratio)
|
||||||
|
self.mlp_ratio = mlp_ratio
|
||||||
|
self.mlp = Mlp(in_features=dim,
|
||||||
|
hidden_features=mlp_hidden_dim,
|
||||||
|
act_layer=act_layer,
|
||||||
|
drop=drop)
|
||||||
|
self.prenorm = prenorm
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
if self.prenorm:
|
||||||
|
x = self.norm1(x + self.drop_path(self.mixer(x)))
|
||||||
|
x = self.norm2(x + self.drop_path(self.mlp(x)))
|
||||||
|
else:
|
||||||
|
x = x + self.drop_path(self.mixer(self.norm1(x)))
|
||||||
|
x = x + self.drop_path(self.mlp(self.norm2(x)))
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class PatchEmbed(nn.Module):
|
||||||
|
""" Image to Patch Embedding
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
img_size=(32, 100),
|
||||||
|
in_channels=3,
|
||||||
|
embed_dim=768,
|
||||||
|
sub_num=2):
|
||||||
|
super().__init__()
|
||||||
|
num_patches = (img_size[1] // (2 ** sub_num)) * \
|
||||||
|
(img_size[0] // (2 ** sub_num))
|
||||||
|
self.img_size = img_size
|
||||||
|
self.num_patches = num_patches
|
||||||
|
self.embed_dim = embed_dim
|
||||||
|
self.norm = None
|
||||||
|
if sub_num == 2:
|
||||||
|
self.proj = nn.Sequential(
|
||||||
|
ConvBNLayer(
|
||||||
|
in_channels=in_channels,
|
||||||
|
out_channels=embed_dim // 2,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=2,
|
||||||
|
padding=1,
|
||||||
|
act=nn.GELU,
|
||||||
|
bias_attr=False),
|
||||||
|
ConvBNLayer(
|
||||||
|
in_channels=embed_dim // 2,
|
||||||
|
out_channels=embed_dim,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=2,
|
||||||
|
padding=1,
|
||||||
|
act=nn.GELU,
|
||||||
|
bias_attr=False))
|
||||||
|
if sub_num == 3:
|
||||||
|
self.proj = nn.Sequential(
|
||||||
|
ConvBNLayer(
|
||||||
|
in_channels=in_channels,
|
||||||
|
out_channels=embed_dim // 4,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=2,
|
||||||
|
padding=1,
|
||||||
|
act=nn.GELU,
|
||||||
|
bias_attr=False),
|
||||||
|
ConvBNLayer(
|
||||||
|
in_channels=embed_dim // 4,
|
||||||
|
out_channels=embed_dim // 2,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=2,
|
||||||
|
padding=1,
|
||||||
|
act=nn.GELU,
|
||||||
|
bias_attr=False),
|
||||||
|
ConvBNLayer(
|
||||||
|
in_channels=embed_dim // 2,
|
||||||
|
out_channels=embed_dim,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=2,
|
||||||
|
padding=1,
|
||||||
|
act=nn.GELU,
|
||||||
|
bias_attr=False))
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
B, C, H, W = x.shape
|
||||||
|
assert H == self.img_size[0] and W == self.img_size[1], \
|
||||||
|
f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
|
||||||
|
x = self.proj(x).flatten(2).permute(0, 2, 1)
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class SubSample(nn.Module):
|
||||||
|
def __init__(self,
|
||||||
|
in_channels,
|
||||||
|
out_channels,
|
||||||
|
types='Pool',
|
||||||
|
stride=(2, 1),
|
||||||
|
sub_norm='nn.LayerNorm',
|
||||||
|
act=None):
|
||||||
|
super().__init__()
|
||||||
|
self.types = types
|
||||||
|
if types == 'Pool':
|
||||||
|
self.avgpool = nn.AvgPool2d(
|
||||||
|
kernel_size=(3, 5), stride=stride, padding=(1, 2))
|
||||||
|
self.maxpool = nn.MaxPool2d(
|
||||||
|
kernel_size=(3, 5), stride=stride, padding=(1, 2))
|
||||||
|
self.proj = nn.Linear(in_channels, out_channels)
|
||||||
|
else:
|
||||||
|
self.conv = nn.Conv2d(
|
||||||
|
in_channels,
|
||||||
|
out_channels,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=stride,
|
||||||
|
padding=1,
|
||||||
|
# weight_attr=ParamAttr(initializer=KaimingNormal())
|
||||||
|
)
|
||||||
|
self.norm = eval(sub_norm)(out_channels)
|
||||||
|
if act is not None:
|
||||||
|
self.act = act()
|
||||||
|
else:
|
||||||
|
self.act = None
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
|
||||||
|
if self.types == 'Pool':
|
||||||
|
x1 = self.avgpool(x)
|
||||||
|
x2 = self.maxpool(x)
|
||||||
|
x = (x1 + x2) * 0.5
|
||||||
|
out = self.proj(x.flatten(2).permute((0, 2, 1)))
|
||||||
|
else:
|
||||||
|
x = self.conv(x)
|
||||||
|
out = x.flatten(2).permute((0, 2, 1))
|
||||||
|
out = self.norm(out)
|
||||||
|
if self.act is not None:
|
||||||
|
out = self.act(out)
|
||||||
|
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
class SVTRNet(nn.Module):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
img_size=[48, 100],
|
||||||
|
in_channels=3,
|
||||||
|
embed_dim=[64, 128, 256],
|
||||||
|
depth=[3, 6, 3],
|
||||||
|
num_heads=[2, 4, 8],
|
||||||
|
mixer=['Local'] * 6 + ['Global'] *
|
||||||
|
6, # Local atten, Global atten, Conv
|
||||||
|
local_mixer=[[7, 11], [7, 11], [7, 11]],
|
||||||
|
patch_merging='Conv', # Conv, Pool, None
|
||||||
|
mlp_ratio=4,
|
||||||
|
qkv_bias=True,
|
||||||
|
qk_scale=None,
|
||||||
|
drop_rate=0.,
|
||||||
|
last_drop=0.1,
|
||||||
|
attn_drop_rate=0.,
|
||||||
|
drop_path_rate=0.1,
|
||||||
|
norm_layer='nn.LayerNorm',
|
||||||
|
sub_norm='nn.LayerNorm',
|
||||||
|
epsilon=1e-6,
|
||||||
|
out_channels=192,
|
||||||
|
out_char_num=25,
|
||||||
|
block_unit='Block',
|
||||||
|
act='nn.GELU',
|
||||||
|
last_stage=True,
|
||||||
|
sub_num=2,
|
||||||
|
prenorm=True,
|
||||||
|
use_lenhead=False,
|
||||||
|
**kwargs):
|
||||||
|
super().__init__()
|
||||||
|
self.img_size = img_size
|
||||||
|
self.embed_dim = embed_dim
|
||||||
|
self.out_channels = out_channels
|
||||||
|
self.prenorm = prenorm
|
||||||
|
patch_merging = None if patch_merging != 'Conv' and patch_merging != 'Pool' else patch_merging
|
||||||
|
self.patch_embed = PatchEmbed(
|
||||||
|
img_size=img_size,
|
||||||
|
in_channels=in_channels,
|
||||||
|
embed_dim=embed_dim[0],
|
||||||
|
sub_num=sub_num)
|
||||||
|
num_patches = self.patch_embed.num_patches
|
||||||
|
self.HW = [img_size[0] // (2**sub_num), img_size[1] // (2**sub_num)]
|
||||||
|
self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim[0]))
|
||||||
|
# self.pos_embed = self.create_parameter(
|
||||||
|
# shape=[1, num_patches, embed_dim[0]], default_initializer=zeros_)
|
||||||
|
|
||||||
|
# self.add_parameter("pos_embed", self.pos_embed)
|
||||||
|
|
||||||
|
self.pos_drop = nn.Dropout(p=drop_rate)
|
||||||
|
Block_unit = eval(block_unit)
|
||||||
|
|
||||||
|
dpr = np.linspace(0, drop_path_rate, sum(depth))
|
||||||
|
self.blocks1 = nn.ModuleList(
|
||||||
|
[
|
||||||
|
Block_unit(
|
||||||
|
dim=embed_dim[0],
|
||||||
|
num_heads=num_heads[0],
|
||||||
|
mixer=mixer[0:depth[0]][i],
|
||||||
|
HW=self.HW,
|
||||||
|
local_mixer=local_mixer[0],
|
||||||
|
mlp_ratio=mlp_ratio,
|
||||||
|
qkv_bias=qkv_bias,
|
||||||
|
qk_scale=qk_scale,
|
||||||
|
drop=drop_rate,
|
||||||
|
act_layer=eval(act),
|
||||||
|
attn_drop=attn_drop_rate,
|
||||||
|
drop_path=dpr[0:depth[0]][i],
|
||||||
|
norm_layer=norm_layer,
|
||||||
|
epsilon=epsilon,
|
||||||
|
prenorm=prenorm) for i in range(depth[0])
|
||||||
|
]
|
||||||
|
)
|
||||||
|
if patch_merging is not None:
|
||||||
|
self.sub_sample1 = SubSample(
|
||||||
|
embed_dim[0],
|
||||||
|
embed_dim[1],
|
||||||
|
sub_norm=sub_norm,
|
||||||
|
stride=[2, 1],
|
||||||
|
types=patch_merging)
|
||||||
|
HW = [self.HW[0] // 2, self.HW[1]]
|
||||||
|
else:
|
||||||
|
HW = self.HW
|
||||||
|
self.patch_merging = patch_merging
|
||||||
|
self.blocks2 = nn.ModuleList([
|
||||||
|
Block_unit(
|
||||||
|
dim=embed_dim[1],
|
||||||
|
num_heads=num_heads[1],
|
||||||
|
mixer=mixer[depth[0]:depth[0] + depth[1]][i],
|
||||||
|
HW=HW,
|
||||||
|
local_mixer=local_mixer[1],
|
||||||
|
mlp_ratio=mlp_ratio,
|
||||||
|
qkv_bias=qkv_bias,
|
||||||
|
qk_scale=qk_scale,
|
||||||
|
drop=drop_rate,
|
||||||
|
act_layer=eval(act),
|
||||||
|
attn_drop=attn_drop_rate,
|
||||||
|
drop_path=dpr[depth[0]:depth[0] + depth[1]][i],
|
||||||
|
norm_layer=norm_layer,
|
||||||
|
epsilon=epsilon,
|
||||||
|
prenorm=prenorm) for i in range(depth[1])
|
||||||
|
])
|
||||||
|
if patch_merging is not None:
|
||||||
|
self.sub_sample2 = SubSample(
|
||||||
|
embed_dim[1],
|
||||||
|
embed_dim[2],
|
||||||
|
sub_norm=sub_norm,
|
||||||
|
stride=[2, 1],
|
||||||
|
types=patch_merging)
|
||||||
|
HW = [self.HW[0] // 4, self.HW[1]]
|
||||||
|
else:
|
||||||
|
HW = self.HW
|
||||||
|
self.blocks3 = nn.ModuleList([
|
||||||
|
Block_unit(
|
||||||
|
dim=embed_dim[2],
|
||||||
|
num_heads=num_heads[2],
|
||||||
|
mixer=mixer[depth[0] + depth[1]:][i],
|
||||||
|
HW=HW,
|
||||||
|
local_mixer=local_mixer[2],
|
||||||
|
mlp_ratio=mlp_ratio,
|
||||||
|
qkv_bias=qkv_bias,
|
||||||
|
qk_scale=qk_scale,
|
||||||
|
drop=drop_rate,
|
||||||
|
act_layer=eval(act),
|
||||||
|
attn_drop=attn_drop_rate,
|
||||||
|
drop_path=dpr[depth[0] + depth[1]:][i],
|
||||||
|
norm_layer=norm_layer,
|
||||||
|
epsilon=epsilon,
|
||||||
|
prenorm=prenorm) for i in range(depth[2])
|
||||||
|
])
|
||||||
|
self.last_stage = last_stage
|
||||||
|
if last_stage:
|
||||||
|
self.avg_pool = nn.AdaptiveAvgPool2d((1, out_char_num))
|
||||||
|
self.last_conv = nn.Conv2d(
|
||||||
|
in_channels=embed_dim[2],
|
||||||
|
out_channels=self.out_channels,
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0,
|
||||||
|
bias=False)
|
||||||
|
self.hardswish = nn.Hardswish()
|
||||||
|
self.dropout = nn.Dropout(p=last_drop)
|
||||||
|
if not prenorm:
|
||||||
|
self.norm = eval(norm_layer)(embed_dim[-1], epsilon=epsilon)
|
||||||
|
self.use_lenhead = use_lenhead
|
||||||
|
if use_lenhead:
|
||||||
|
self.len_conv = nn.Linear(embed_dim[2], self.out_channels)
|
||||||
|
self.hardswish_len = nn.Hardswish()
|
||||||
|
self.dropout_len = nn.Dropout(
|
||||||
|
p=last_drop)
|
||||||
|
|
||||||
|
trunc_normal_(self.pos_embed,std=.02)
|
||||||
|
self.apply(self._init_weights)
|
||||||
|
|
||||||
|
def _init_weights(self, m):
|
||||||
|
if isinstance(m, nn.Linear):
|
||||||
|
trunc_normal_(m.weight,std=.02)
|
||||||
|
if isinstance(m, nn.Linear) and m.bias is not None:
|
||||||
|
zeros_(m.bias)
|
||||||
|
elif isinstance(m, nn.LayerNorm):
|
||||||
|
zeros_(m.bias)
|
||||||
|
ones_(m.weight)
|
||||||
|
|
||||||
|
def forward_features(self, x):
|
||||||
|
x = self.patch_embed(x)
|
||||||
|
x = x + self.pos_embed
|
||||||
|
x = self.pos_drop(x)
|
||||||
|
for blk in self.blocks1:
|
||||||
|
x = blk(x)
|
||||||
|
if self.patch_merging is not None:
|
||||||
|
x = self.sub_sample1(
|
||||||
|
x.permute([0, 2, 1]).reshape(
|
||||||
|
[-1, self.embed_dim[0], self.HW[0], self.HW[1]]))
|
||||||
|
for blk in self.blocks2:
|
||||||
|
x = blk(x)
|
||||||
|
if self.patch_merging is not None:
|
||||||
|
x = self.sub_sample2(
|
||||||
|
x.permute([0, 2, 1]).reshape(
|
||||||
|
[-1, self.embed_dim[1], self.HW[0] // 2, self.HW[1]]))
|
||||||
|
for blk in self.blocks3:
|
||||||
|
x = blk(x)
|
||||||
|
if not self.prenorm:
|
||||||
|
x = self.norm(x)
|
||||||
|
return x
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
x = self.forward_features(x)
|
||||||
|
if self.use_lenhead:
|
||||||
|
len_x = self.len_conv(x.mean(1))
|
||||||
|
len_x = self.dropout_len(self.hardswish_len(len_x))
|
||||||
|
if self.last_stage:
|
||||||
|
if self.patch_merging is not None:
|
||||||
|
h = self.HW[0] // 4
|
||||||
|
else:
|
||||||
|
h = self.HW[0]
|
||||||
|
x = self.avg_pool(
|
||||||
|
x.permute([0, 2, 1]).reshape(
|
||||||
|
[-1, self.embed_dim[2], h, self.HW[1]]))
|
||||||
|
x = self.last_conv(x)
|
||||||
|
x = self.hardswish(x)
|
||||||
|
x = self.dropout(x)
|
||||||
|
if self.use_lenhead:
|
||||||
|
return x, len_x
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
if __name__=="__main__":
|
||||||
|
a = torch.rand(1,3,48,100)
|
||||||
|
svtr = SVTRNet()
|
||||||
|
|
||||||
|
out = svtr(a)
|
||||||
|
print(svtr)
|
||||||
|
print(out.size())
|
@ -0,0 +1,74 @@
|
|||||||
|
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.nn.functional as F
|
||||||
|
|
||||||
|
|
||||||
|
class Hswish(nn.Module):
|
||||||
|
def __init__(self, inplace=True):
|
||||||
|
super(Hswish, self).__init__()
|
||||||
|
self.inplace = inplace
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
return x * F.relu6(x + 3., inplace=self.inplace) / 6.
|
||||||
|
|
||||||
|
# out = max(0, min(1, slop*x+offset))
|
||||||
|
# paddle.fluid.layers.hard_sigmoid(x, slope=0.2, offset=0.5, name=None)
|
||||||
|
class Hsigmoid(nn.Module):
|
||||||
|
def __init__(self, inplace=True):
|
||||||
|
super(Hsigmoid, self).__init__()
|
||||||
|
self.inplace = inplace
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
# torch: F.relu6(x + 3., inplace=self.inplace) / 6.
|
||||||
|
# paddle: F.relu6(1.2 * x + 3., inplace=self.inplace) / 6.
|
||||||
|
return F.relu6(1.2 * x + 3., inplace=self.inplace) / 6.
|
||||||
|
|
||||||
|
class GELU(nn.Module):
|
||||||
|
def __init__(self, inplace=True):
|
||||||
|
super(GELU, self).__init__()
|
||||||
|
self.inplace = inplace
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
return torch.nn.functional.gelu(x)
|
||||||
|
|
||||||
|
|
||||||
|
class Swish(nn.Module):
|
||||||
|
def __init__(self, inplace=True):
|
||||||
|
super(Swish, self).__init__()
|
||||||
|
self.inplace = inplace
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
if self.inplace:
|
||||||
|
x.mul_(torch.sigmoid(x))
|
||||||
|
return x
|
||||||
|
else:
|
||||||
|
return x*torch.sigmoid(x)
|
||||||
|
|
||||||
|
|
||||||
|
class Activation(nn.Module):
|
||||||
|
def __init__(self, act_type, inplace=True):
|
||||||
|
super(Activation, self).__init__()
|
||||||
|
act_type = act_type.lower()
|
||||||
|
if act_type == 'relu':
|
||||||
|
self.act = nn.ReLU(inplace=inplace)
|
||||||
|
elif act_type == 'relu6':
|
||||||
|
self.act = nn.ReLU6(inplace=inplace)
|
||||||
|
elif act_type == 'sigmoid':
|
||||||
|
raise NotImplementedError
|
||||||
|
elif act_type == 'hard_sigmoid':
|
||||||
|
self.act = Hsigmoid(inplace)
|
||||||
|
elif act_type == 'hard_swish':
|
||||||
|
self.act = Hswish(inplace=inplace)
|
||||||
|
elif act_type == 'leakyrelu':
|
||||||
|
self.act = nn.LeakyReLU(inplace=inplace)
|
||||||
|
elif act_type == 'gelu':
|
||||||
|
self.act = GELU(inplace=inplace)
|
||||||
|
elif act_type == 'swish':
|
||||||
|
self.act = Swish(inplace=inplace)
|
||||||
|
else:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def forward(self, inputs):
|
||||||
|
return self.act(inputs)
|
@ -0,0 +1,95 @@
|
|||||||
|
0
|
||||||
|
1
|
||||||
|
2
|
||||||
|
3
|
||||||
|
4
|
||||||
|
5
|
||||||
|
6
|
||||||
|
7
|
||||||
|
8
|
||||||
|
9
|
||||||
|
:
|
||||||
|
;
|
||||||
|
<
|
||||||
|
=
|
||||||
|
>
|
||||||
|
?
|
||||||
|
@
|
||||||
|
A
|
||||||
|
B
|
||||||
|
C
|
||||||
|
D
|
||||||
|
E
|
||||||
|
F
|
||||||
|
G
|
||||||
|
H
|
||||||
|
I
|
||||||
|
J
|
||||||
|
K
|
||||||
|
L
|
||||||
|
M
|
||||||
|
N
|
||||||
|
O
|
||||||
|
P
|
||||||
|
Q
|
||||||
|
R
|
||||||
|
S
|
||||||
|
T
|
||||||
|
U
|
||||||
|
V
|
||||||
|
W
|
||||||
|
X
|
||||||
|
Y
|
||||||
|
Z
|
||||||
|
[
|
||||||
|
\
|
||||||
|
]
|
||||||
|
^
|
||||||
|
_
|
||||||
|
`
|
||||||
|
a
|
||||||
|
b
|
||||||
|
c
|
||||||
|
d
|
||||||
|
e
|
||||||
|
f
|
||||||
|
g
|
||||||
|
h
|
||||||
|
i
|
||||||
|
j
|
||||||
|
k
|
||||||
|
l
|
||||||
|
m
|
||||||
|
n
|
||||||
|
o
|
||||||
|
p
|
||||||
|
q
|
||||||
|
r
|
||||||
|
s
|
||||||
|
t
|
||||||
|
u
|
||||||
|
v
|
||||||
|
w
|
||||||
|
x
|
||||||
|
y
|
||||||
|
z
|
||||||
|
{
|
||||||
|
|
|
||||||
|
}
|
||||||
|
~
|
||||||
|
!
|
||||||
|
"
|
||||||
|
#
|
||||||
|
$
|
||||||
|
%
|
||||||
|
&
|
||||||
|
'
|
||||||
|
(
|
||||||
|
)
|
||||||
|
*
|
||||||
|
+
|
||||||
|
,
|
||||||
|
-
|
||||||
|
.
|
||||||
|
/
|
||||||
|
|
243
AnyText/AnyText_scripts/ldm/models/diffusion/plms.py
Normal file
@ -0,0 +1,243 @@
|
|||||||
|
"""SAMPLING ONLY."""
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from ...modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like
|
||||||
|
from ...models.diffusion.sampling_util import norm_thresholding
|
||||||
|
|
||||||
|
|
||||||
|
class PLMSSampler(object):
|
||||||
|
def __init__(self, model, schedule="linear", **kwargs):
|
||||||
|
super().__init__()
|
||||||
|
self.model = model
|
||||||
|
self.ddpm_num_timesteps = model.num_timesteps
|
||||||
|
self.schedule = schedule
|
||||||
|
|
||||||
|
def register_buffer(self, name, attr):
|
||||||
|
if type(attr) == torch.Tensor:
|
||||||
|
if attr.device != torch.device("cuda"):
|
||||||
|
attr = attr.to(torch.device("cuda"))
|
||||||
|
setattr(self, name, attr)
|
||||||
|
|
||||||
|
def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
|
||||||
|
if ddim_eta != 0:
|
||||||
|
raise ValueError('ddim_eta must be 0 for PLMS')
|
||||||
|
self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
|
||||||
|
num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose)
|
||||||
|
alphas_cumprod = self.model.alphas_cumprod
|
||||||
|
assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
|
||||||
|
to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
|
||||||
|
|
||||||
|
self.register_buffer('betas', to_torch(self.model.betas))
|
||||||
|
self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
|
||||||
|
self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))
|
||||||
|
|
||||||
|
# calculations for diffusion q(x_t | x_{t-1}) and others
|
||||||
|
self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
|
||||||
|
self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
|
||||||
|
self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
|
||||||
|
self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
|
||||||
|
self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))
|
||||||
|
|
||||||
|
# ddim sampling parameters
|
||||||
|
ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
|
||||||
|
ddim_timesteps=self.ddim_timesteps,
|
||||||
|
eta=ddim_eta,verbose=verbose)
|
||||||
|
self.register_buffer('ddim_sigmas', ddim_sigmas)
|
||||||
|
self.register_buffer('ddim_alphas', ddim_alphas)
|
||||||
|
self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
|
||||||
|
self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
|
||||||
|
sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
|
||||||
|
(1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
|
||||||
|
1 - self.alphas_cumprod / self.alphas_cumprod_prev))
|
||||||
|
self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def sample(self,
|
||||||
|
S,
|
||||||
|
batch_size,
|
||||||
|
shape,
|
||||||
|
conditioning=None,
|
||||||
|
callback=None,
|
||||||
|
normals_sequence=None,
|
||||||
|
img_callback=None,
|
||||||
|
quantize_x0=False,
|
||||||
|
eta=0.,
|
||||||
|
mask=None,
|
||||||
|
x0=None,
|
||||||
|
temperature=1.,
|
||||||
|
noise_dropout=0.,
|
||||||
|
score_corrector=None,
|
||||||
|
corrector_kwargs=None,
|
||||||
|
verbose=True,
|
||||||
|
x_T=None,
|
||||||
|
log_every_t=100,
|
||||||
|
unconditional_guidance_scale=1.,
|
||||||
|
unconditional_conditioning=None,
|
||||||
|
# this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
|
||||||
|
dynamic_threshold=None,
|
||||||
|
**kwargs
|
||||||
|
):
|
||||||
|
if conditioning is not None:
|
||||||
|
if isinstance(conditioning, dict):
|
||||||
|
cbs = conditioning[list(conditioning.keys())[0]].shape[0]
|
||||||
|
if cbs != batch_size:
|
||||||
|
print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
|
||||||
|
else:
|
||||||
|
if conditioning.shape[0] != batch_size:
|
||||||
|
print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")
|
||||||
|
|
||||||
|
self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
|
||||||
|
# sampling
|
||||||
|
C, H, W = shape
|
||||||
|
size = (batch_size, C, H, W)
|
||||||
|
print(f'Data shape for PLMS sampling is {size}')
|
||||||
|
|
||||||
|
samples, intermediates = self.plms_sampling(conditioning, size,
|
||||||
|
callback=callback,
|
||||||
|
img_callback=img_callback,
|
||||||
|
quantize_denoised=quantize_x0,
|
||||||
|
mask=mask, x0=x0,
|
||||||
|
ddim_use_original_steps=False,
|
||||||
|
noise_dropout=noise_dropout,
|
||||||
|
temperature=temperature,
|
||||||
|
score_corrector=score_corrector,
|
||||||
|
corrector_kwargs=corrector_kwargs,
|
||||||
|
x_T=x_T,
|
||||||
|
log_every_t=log_every_t,
|
||||||
|
unconditional_guidance_scale=unconditional_guidance_scale,
|
||||||
|
unconditional_conditioning=unconditional_conditioning,
|
||||||
|
dynamic_threshold=dynamic_threshold,
|
||||||
|
)
|
||||||
|
return samples, intermediates
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def plms_sampling(self, cond, shape,
|
||||||
|
x_T=None, ddim_use_original_steps=False,
|
||||||
|
callback=None, timesteps=None, quantize_denoised=False,
|
||||||
|
mask=None, x0=None, img_callback=None, log_every_t=100,
|
||||||
|
temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
|
||||||
|
unconditional_guidance_scale=1., unconditional_conditioning=None,
|
||||||
|
dynamic_threshold=None):
|
||||||
|
device = self.model.betas.device
|
||||||
|
b = shape[0]
|
||||||
|
if x_T is None:
|
||||||
|
img = torch.randn(shape, device=device)
|
||||||
|
else:
|
||||||
|
img = x_T
|
||||||
|
|
||||||
|
if timesteps is None:
|
||||||
|
timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
|
||||||
|
elif timesteps is not None and not ddim_use_original_steps:
|
||||||
|
subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
|
||||||
|
timesteps = self.ddim_timesteps[:subset_end]
|
||||||
|
|
||||||
|
intermediates = {'x_inter': [img], 'pred_x0': [img]}
|
||||||
|
time_range = list(reversed(range(0,timesteps))) if ddim_use_original_steps else np.flip(timesteps)
|
||||||
|
total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
|
||||||
|
print(f"Running PLMS Sampling with {total_steps} timesteps")
|
||||||
|
|
||||||
|
iterator = tqdm(time_range, desc='PLMS Sampler', total=total_steps)
|
||||||
|
old_eps = []
|
||||||
|
|
||||||
|
for i, step in enumerate(iterator):
|
||||||
|
index = total_steps - i - 1
|
||||||
|
ts = torch.full((b,), step, device=device, dtype=torch.long)
|
||||||
|
ts_next = torch.full((b,), time_range[min(i + 1, len(time_range) - 1)], device=device, dtype=torch.long)
|
||||||
|
|
||||||
|
if mask is not None:
|
||||||
|
assert x0 is not None
|
||||||
|
img_orig = self.model.q_sample(x0, ts) # TODO: deterministic forward pass?
|
||||||
|
img = img_orig * mask + (1. - mask) * img
|
||||||
|
|
||||||
|
outs = self.p_sample_plms(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
|
||||||
|
quantize_denoised=quantize_denoised, temperature=temperature,
|
||||||
|
noise_dropout=noise_dropout, score_corrector=score_corrector,
|
||||||
|
corrector_kwargs=corrector_kwargs,
|
||||||
|
unconditional_guidance_scale=unconditional_guidance_scale,
|
||||||
|
unconditional_conditioning=unconditional_conditioning,
|
||||||
|
old_eps=old_eps, t_next=ts_next,
|
||||||
|
dynamic_threshold=dynamic_threshold)
|
||||||
|
img, pred_x0, e_t = outs
|
||||||
|
old_eps.append(e_t)
|
||||||
|
if len(old_eps) >= 4:
|
||||||
|
old_eps.pop(0)
|
||||||
|
if callback: callback(i)
|
||||||
|
if img_callback: img_callback(pred_x0, i)
|
||||||
|
|
||||||
|
if index % log_every_t == 0 or index == total_steps - 1:
|
||||||
|
intermediates['x_inter'].append(img)
|
||||||
|
intermediates['pred_x0'].append(pred_x0)
|
||||||
|
|
||||||
|
return img, intermediates
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def p_sample_plms(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
|
||||||
|
temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
|
||||||
|
unconditional_guidance_scale=1., unconditional_conditioning=None, old_eps=None, t_next=None,
|
||||||
|
dynamic_threshold=None):
|
||||||
|
b, *_, device = *x.shape, x.device
|
||||||
|
|
||||||
|
def get_model_output(x, t):
|
||||||
|
if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
|
||||||
|
e_t = self.model.apply_model(x, t, c)
|
||||||
|
else:
|
||||||
|
x_in = torch.cat([x] * 2)
|
||||||
|
t_in = torch.cat([t] * 2)
|
||||||
|
c_in = torch.cat([unconditional_conditioning, c])
|
||||||
|
e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
|
||||||
|
e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
|
||||||
|
|
||||||
|
if score_corrector is not None:
|
||||||
|
assert self.model.parameterization == "eps"
|
||||||
|
e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)
|
||||||
|
|
||||||
|
return e_t
|
||||||
|
|
||||||
|
alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
|
||||||
|
alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
|
||||||
|
sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
|
||||||
|
sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
|
||||||
|
|
||||||
|
def get_x_prev_and_pred_x0(e_t, index):
|
||||||
|
# select parameters corresponding to the currently considered timestep
|
||||||
|
a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
|
||||||
|
a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
|
||||||
|
sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
|
||||||
|
sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)
|
||||||
|
|
||||||
|
# current prediction for x_0
|
||||||
|
pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
|
||||||
|
if quantize_denoised:
|
||||||
|
pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
|
||||||
|
if dynamic_threshold is not None:
|
||||||
|
pred_x0 = norm_thresholding(pred_x0, dynamic_threshold)
|
||||||
|
# direction pointing to x_t
|
||||||
|
dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
|
||||||
|
noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
|
||||||
|
if noise_dropout > 0.:
|
||||||
|
noise = torch.nn.functional.dropout(noise, p=noise_dropout)
|
||||||
|
x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
|
||||||
|
return x_prev, pred_x0
|
||||||
|
|
||||||
|
e_t = get_model_output(x, t)
|
||||||
|
if len(old_eps) == 0:
|
||||||
|
# Pseudo Improved Euler (2nd order)
|
||||||
|
x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index)
|
||||||
|
e_t_next = get_model_output(x_prev, t_next)
|
||||||
|
e_t_prime = (e_t + e_t_next) / 2
|
||||||
|
elif len(old_eps) == 1:
|
||||||
|
# 2nd order Pseudo Linear Multistep (Adams-Bashforth)
|
||||||
|
e_t_prime = (3 * e_t - old_eps[-1]) / 2
|
||||||
|
elif len(old_eps) == 2:
|
||||||
|
# 3nd order Pseudo Linear Multistep (Adams-Bashforth)
|
||||||
|
e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12
|
||||||
|
elif len(old_eps) >= 3:
|
||||||
|
# 4nd order Pseudo Linear Multistep (Adams-Bashforth)
|
||||||
|
e_t_prime = (55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3]) / 24
|
||||||
|
|
||||||
|
x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index)
|
||||||
|
|
||||||
|
return x_prev, pred_x0, e_t
|
307
AnyText/AnyText_scripts/ldm/models/diffusion/recognizer.py
Normal file
@ -0,0 +1,307 @@
|
|||||||
|
'''
|
||||||
|
Copyright (c) Alibaba, Inc. and its affiliates.
|
||||||
|
'''
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
||||||
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
import math
|
||||||
|
import traceback
|
||||||
|
from easydict import EasyDict as edict
|
||||||
|
import time
|
||||||
|
from .ocr_recog.RecModel import RecModel
|
||||||
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from skimage.transform._geometric import _umeyama as get_sym_mat
|
||||||
|
|
||||||
|
|
||||||
|
def min_bounding_rect(img):
|
||||||
|
ret, thresh = cv2.threshold(img, 127, 255, 0)
|
||||||
|
contours, hierarchy = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||||
|
if len(contours) == 0:
|
||||||
|
print('Bad contours, using fake bbox...')
|
||||||
|
return np.array([[0, 0], [100, 0], [100, 100], [0, 100]])
|
||||||
|
max_contour = max(contours, key=cv2.contourArea)
|
||||||
|
rect = cv2.minAreaRect(max_contour)
|
||||||
|
box = cv2.boxPoints(rect)
|
||||||
|
box = np.int0(box)
|
||||||
|
# sort
|
||||||
|
x_sorted = sorted(box, key=lambda x: x[0])
|
||||||
|
left = x_sorted[:2]
|
||||||
|
right = x_sorted[2:]
|
||||||
|
left = sorted(left, key=lambda x: x[1])
|
||||||
|
(tl, bl) = left
|
||||||
|
right = sorted(right, key=lambda x: x[1])
|
||||||
|
(tr, br) = right
|
||||||
|
if tl[1] > bl[1]:
|
||||||
|
(tl, bl) = (bl, tl)
|
||||||
|
if tr[1] > br[1]:
|
||||||
|
(tr, br) = (br, tr)
|
||||||
|
return np.array([tl, tr, br, bl])
|
||||||
|
|
||||||
|
|
||||||
|
def adjust_image(box, img):
|
||||||
|
pts1 = np.float32([box[0], box[1], box[2], box[3]])
|
||||||
|
width = max(np.linalg.norm(pts1[0]-pts1[1]), np.linalg.norm(pts1[2]-pts1[3]))
|
||||||
|
height = max(np.linalg.norm(pts1[0]-pts1[3]), np.linalg.norm(pts1[1]-pts1[2]))
|
||||||
|
pts2 = np.float32([[0, 0], [width, 0], [width, height], [0, height]])
|
||||||
|
# get transform matrix
|
||||||
|
M = get_sym_mat(pts1, pts2, estimate_scale=True)
|
||||||
|
C, H, W = img.shape
|
||||||
|
T = np.array([[2 / W, 0, -1], [0, 2 / H, -1], [0, 0, 1]])
|
||||||
|
theta = np.linalg.inv(T @ M @ np.linalg.inv(T))
|
||||||
|
theta = torch.from_numpy(theta[:2, :]).unsqueeze(0).type(torch.float32).to(img.device)
|
||||||
|
grid = F.affine_grid(theta, torch.Size([1, C, H, W]), align_corners=True)
|
||||||
|
result = F.grid_sample(img.unsqueeze(0), grid, align_corners=True)
|
||||||
|
result = torch.clamp(result.squeeze(0), 0, 255)
|
||||||
|
# crop
|
||||||
|
result = result[:, :int(height), :int(width)]
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
'''
|
||||||
|
mask: numpy.ndarray, mask of textual, HWC
|
||||||
|
src_img: torch.Tensor, source image, CHW
|
||||||
|
'''
|
||||||
|
def crop_image(src_img, mask):
|
||||||
|
box = min_bounding_rect(mask)
|
||||||
|
result = adjust_image(box, src_img)
|
||||||
|
if len(result.shape) == 2:
|
||||||
|
result = torch.stack([result]*3, axis=-1)
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def create_predictor(model_dir=None, model_lang='ch', is_onnx=False):
|
||||||
|
model_file_path = model_dir
|
||||||
|
if model_file_path is not None and not os.path.exists(model_file_path):
|
||||||
|
raise ValueError("not find model file path {}".format(model_file_path))
|
||||||
|
|
||||||
|
if is_onnx:
|
||||||
|
import onnxruntime as ort
|
||||||
|
sess = ort.InferenceSession(model_file_path, providers=['CPUExecutionProvider']) # 'TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider'
|
||||||
|
return sess
|
||||||
|
else:
|
||||||
|
if model_lang == 'ch':
|
||||||
|
n_class = 6625
|
||||||
|
elif model_lang == 'en':
|
||||||
|
n_class = 97
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unsupported OCR recog model_lang: {model_lang}")
|
||||||
|
rec_config = edict(
|
||||||
|
in_channels=3,
|
||||||
|
backbone=edict(type='MobileNetV1Enhance', scale=0.5, last_conv_stride=[1, 2], last_pool_type='avg'),
|
||||||
|
neck=edict(type='SequenceEncoder', encoder_type="svtr", dims=64, depth=2, hidden_dims=120, use_guide=True),
|
||||||
|
head=edict(type='CTCHead', fc_decay=0.00001, out_channels=n_class, return_feats=True)
|
||||||
|
)
|
||||||
|
|
||||||
|
rec_model = RecModel(rec_config)
|
||||||
|
if model_file_path is not None:
|
||||||
|
rec_model.load_state_dict(torch.load(model_file_path, map_location="cpu"))
|
||||||
|
rec_model.eval()
|
||||||
|
return rec_model.eval()
|
||||||
|
|
||||||
|
|
||||||
|
def _check_image_file(path):
|
||||||
|
img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff'}
|
||||||
|
return any([path.lower().endswith(e) for e in img_end])
|
||||||
|
|
||||||
|
|
||||||
|
def get_image_file_list(img_file):
|
||||||
|
imgs_lists = []
|
||||||
|
if img_file is None or not os.path.exists(img_file):
|
||||||
|
raise Exception("not found any img file in {}".format(img_file))
|
||||||
|
if os.path.isfile(img_file) and _check_image_file(img_file):
|
||||||
|
imgs_lists.append(img_file)
|
||||||
|
elif os.path.isdir(img_file):
|
||||||
|
for single_file in os.listdir(img_file):
|
||||||
|
file_path = os.path.join(img_file, single_file)
|
||||||
|
if os.path.isfile(file_path) and _check_image_file(file_path):
|
||||||
|
imgs_lists.append(file_path)
|
||||||
|
if len(imgs_lists) == 0:
|
||||||
|
raise Exception("not found any img file in {}".format(img_file))
|
||||||
|
imgs_lists = sorted(imgs_lists)
|
||||||
|
return imgs_lists
|
||||||
|
|
||||||
|
|
||||||
|
class TextRecognizer(object):
|
||||||
|
def __init__(self, args, predictor):
|
||||||
|
self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")]
|
||||||
|
self.rec_batch_num = args.rec_batch_num
|
||||||
|
self.predictor = predictor
|
||||||
|
self.chars = self.get_char_dict(args.rec_char_dict_path)
|
||||||
|
self.char2id = {x: i for i, x in enumerate(self.chars)}
|
||||||
|
self.is_onnx = not isinstance(self.predictor, torch.nn.Module)
|
||||||
|
self.use_fp16 = args.use_fp16
|
||||||
|
|
||||||
|
# img: CHW
|
||||||
|
def resize_norm_img(self, img, max_wh_ratio):
|
||||||
|
imgC, imgH, imgW = self.rec_image_shape
|
||||||
|
assert imgC == img.shape[0]
|
||||||
|
imgW = int((imgH * max_wh_ratio))
|
||||||
|
|
||||||
|
h, w = img.shape[1:]
|
||||||
|
ratio = w / float(h)
|
||||||
|
if math.ceil(imgH * ratio) > imgW:
|
||||||
|
resized_w = imgW
|
||||||
|
else:
|
||||||
|
resized_w = int(math.ceil(imgH * ratio))
|
||||||
|
resized_image = torch.nn.functional.interpolate(
|
||||||
|
img.unsqueeze(0),
|
||||||
|
size=(imgH, resized_w),
|
||||||
|
mode='bilinear',
|
||||||
|
align_corners=True,
|
||||||
|
)
|
||||||
|
resized_image /= 255.0
|
||||||
|
resized_image -= 0.5
|
||||||
|
resized_image /= 0.5
|
||||||
|
padding_im = torch.zeros((imgC, imgH, imgW), dtype=torch.float32).to(img.device)
|
||||||
|
padding_im[:, :, 0:resized_w] = resized_image[0]
|
||||||
|
return padding_im
|
||||||
|
|
||||||
|
# img_list: list of tensors with shape chw 0-255
|
||||||
|
def pred_imglist(self, img_list, show_debug=False):
|
||||||
|
img_num = len(img_list)
|
||||||
|
assert img_num > 0
|
||||||
|
# Calculate the aspect ratio of all text bars
|
||||||
|
width_list = []
|
||||||
|
for img in img_list:
|
||||||
|
width_list.append(img.shape[2] / float(img.shape[1]))
|
||||||
|
# Sorting can speed up the recognition process
|
||||||
|
indices = torch.from_numpy(np.argsort(np.array(width_list)))
|
||||||
|
batch_num = self.rec_batch_num
|
||||||
|
preds_all = [None] * img_num
|
||||||
|
preds_neck_all = [None] * img_num
|
||||||
|
for beg_img_no in range(0, img_num, batch_num):
|
||||||
|
end_img_no = min(img_num, beg_img_no + batch_num)
|
||||||
|
norm_img_batch = []
|
||||||
|
|
||||||
|
imgC, imgH, imgW = self.rec_image_shape[:3]
|
||||||
|
max_wh_ratio = imgW / imgH
|
||||||
|
for ino in range(beg_img_no, end_img_no):
|
||||||
|
h, w = img_list[indices[ino]].shape[1:]
|
||||||
|
if h > w * 1.2:
|
||||||
|
img = img_list[indices[ino]]
|
||||||
|
img = torch.transpose(img, 1, 2).flip(dims=[1])
|
||||||
|
img_list[indices[ino]] = img
|
||||||
|
h, w = img.shape[1:]
|
||||||
|
# wh_ratio = w * 1.0 / h
|
||||||
|
# max_wh_ratio = max(max_wh_ratio, wh_ratio) # comment to not use different ratio
|
||||||
|
for ino in range(beg_img_no, end_img_no):
|
||||||
|
norm_img = self.resize_norm_img(img_list[indices[ino]], max_wh_ratio)
|
||||||
|
if self.use_fp16:
|
||||||
|
norm_img = norm_img.half()
|
||||||
|
norm_img = norm_img.unsqueeze(0)
|
||||||
|
norm_img_batch.append(norm_img)
|
||||||
|
norm_img_batch = torch.cat(norm_img_batch, dim=0)
|
||||||
|
if show_debug:
|
||||||
|
for i in range(len(norm_img_batch)):
|
||||||
|
_img = norm_img_batch[i].permute(1, 2, 0).detach().cpu().numpy()
|
||||||
|
_img = (_img + 0.5)*255
|
||||||
|
_img = _img[:, :, ::-1]
|
||||||
|
file_name = f'{indices[beg_img_no + i]}'
|
||||||
|
if os.path.exists(file_name + '.jpg'):
|
||||||
|
file_name += '_2' # ori image
|
||||||
|
cv2.imwrite(file_name + '.jpg', _img)
|
||||||
|
if self.is_onnx:
|
||||||
|
input_dict = {}
|
||||||
|
input_dict[self.predictor.get_inputs()[0].name] = norm_img_batch.detach().cpu().numpy()
|
||||||
|
outputs = self.predictor.run(None, input_dict)
|
||||||
|
preds = {}
|
||||||
|
preds['ctc'] = torch.from_numpy(outputs[0])
|
||||||
|
preds['ctc_neck'] = [torch.zeros(1)] * img_num
|
||||||
|
else:
|
||||||
|
preds = self.predictor(norm_img_batch)
|
||||||
|
for rno in range(preds['ctc'].shape[0]):
|
||||||
|
preds_all[indices[beg_img_no + rno]] = preds['ctc'][rno]
|
||||||
|
preds_neck_all[indices[beg_img_no + rno]] = preds['ctc_neck'][rno]
|
||||||
|
|
||||||
|
return torch.stack(preds_all, dim=0), torch.stack(preds_neck_all, dim=0)
|
||||||
|
|
||||||
|
def get_char_dict(self, character_dict_path):
|
||||||
|
character_str = []
|
||||||
|
with open(character_dict_path, "rb") as fin:
|
||||||
|
lines = fin.readlines()
|
||||||
|
for line in lines:
|
||||||
|
line = line.decode('utf-8').strip("\n").strip("\r\n")
|
||||||
|
character_str.append(line)
|
||||||
|
dict_character = list(character_str)
|
||||||
|
dict_character = ['sos'] + dict_character + [' '] # eos is space
|
||||||
|
return dict_character
|
||||||
|
|
||||||
|
def get_text(self, order):
|
||||||
|
char_list = [self.chars[text_id] for text_id in order]
|
||||||
|
return ''.join(char_list)
|
||||||
|
|
||||||
|
def decode(self, mat):
|
||||||
|
text_index = mat.detach().cpu().numpy().argmax(axis=1)
|
||||||
|
ignored_tokens = [0]
|
||||||
|
selection = np.ones(len(text_index), dtype=bool)
|
||||||
|
selection[1:] = text_index[1:] != text_index[:-1]
|
||||||
|
for ignored_token in ignored_tokens:
|
||||||
|
selection &= text_index != ignored_token
|
||||||
|
return text_index[selection], np.where(selection)[0]
|
||||||
|
|
||||||
|
def get_ctcloss(self, preds, gt_text, weight):
|
||||||
|
if not isinstance(weight, torch.Tensor):
|
||||||
|
weight = torch.tensor(weight).to(preds.device)
|
||||||
|
ctc_loss = torch.nn.CTCLoss(reduction='none')
|
||||||
|
log_probs = preds.log_softmax(dim=2).permute(1, 0, 2) # NTC-->TNC
|
||||||
|
targets = []
|
||||||
|
target_lengths = []
|
||||||
|
for t in gt_text:
|
||||||
|
targets += [self.char2id.get(i, len(self.chars)-1) for i in t]
|
||||||
|
target_lengths += [len(t)]
|
||||||
|
targets = torch.tensor(targets).to(preds.device)
|
||||||
|
target_lengths = torch.tensor(target_lengths).to(preds.device)
|
||||||
|
input_lengths = torch.tensor([log_probs.shape[0]]*(log_probs.shape[1])).to(preds.device)
|
||||||
|
loss = ctc_loss(log_probs, targets, input_lengths, target_lengths)
|
||||||
|
loss = loss / input_lengths * weight
|
||||||
|
return loss
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
rec_model_dir = "./ocr_weights/ppv3_rec.pth"
|
||||||
|
predictor = create_predictor(rec_model_dir)
|
||||||
|
args = edict()
|
||||||
|
args.rec_image_shape = "3, 48, 320"
|
||||||
|
args.rec_char_dict_path = './ocr_weights/ppocr_keys_v1.txt'
|
||||||
|
args.rec_batch_num = 6
|
||||||
|
text_recognizer = TextRecognizer(args, predictor)
|
||||||
|
image_dir = './test_imgs_cn'
|
||||||
|
gt_text = ['韩国小馆']*14
|
||||||
|
|
||||||
|
image_file_list = get_image_file_list(image_dir)
|
||||||
|
valid_image_file_list = []
|
||||||
|
img_list = []
|
||||||
|
|
||||||
|
for image_file in image_file_list:
|
||||||
|
img = cv2.imread(image_file)
|
||||||
|
if img is None:
|
||||||
|
print("error in loading image:{}".format(image_file))
|
||||||
|
continue
|
||||||
|
valid_image_file_list.append(image_file)
|
||||||
|
img_list.append(torch.from_numpy(img).permute(2, 0, 1).float())
|
||||||
|
try:
|
||||||
|
tic = time.time()
|
||||||
|
times = []
|
||||||
|
for i in range(10):
|
||||||
|
preds, _ = text_recognizer.pred_imglist(img_list) # get text
|
||||||
|
preds_all = preds.softmax(dim=2)
|
||||||
|
times += [(time.time()-tic)*1000.]
|
||||||
|
tic = time.time()
|
||||||
|
print(times)
|
||||||
|
print(np.mean(times[1:]) / len(preds_all))
|
||||||
|
weight = np.ones(len(gt_text))
|
||||||
|
loss = text_recognizer.get_ctcloss(preds, gt_text, weight)
|
||||||
|
for i in range(len(valid_image_file_list)):
|
||||||
|
pred = preds_all[i]
|
||||||
|
order, idx = text_recognizer.decode(pred)
|
||||||
|
text = text_recognizer.get_text(order)
|
||||||
|
print(f'{valid_image_file_list[i]}: pred/gt="{text}"/"{gt_text[i]}", loss={loss[i]:.2f}')
|
||||||
|
except Exception as E:
|
||||||
|
print(traceback.format_exc(), E)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
@ -0,0 +1,22 @@
|
|||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def append_dims(x, target_dims):
|
||||||
|
"""Appends dimensions to the end of a tensor until it has target_dims dimensions.
|
||||||
|
From https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/utils.py"""
|
||||||
|
dims_to_append = target_dims - x.ndim
|
||||||
|
if dims_to_append < 0:
|
||||||
|
raise ValueError(f'input has {x.ndim} dims but target_dims is {target_dims}, which is less')
|
||||||
|
return x[(...,) + (None,) * dims_to_append]
|
||||||
|
|
||||||
|
|
||||||
|
def norm_thresholding(x0, value):
|
||||||
|
s = append_dims(x0.pow(2).flatten(1).mean(1).sqrt().clamp(min=value), x0.ndim)
|
||||||
|
return x0 * (value / s)
|
||||||
|
|
||||||
|
|
||||||
|
def spatial_norm_thresholding(x0, value):
|
||||||
|
# b c h w
|
||||||
|
s = x0.pow(2).mean(1, keepdim=True).sqrt().clamp(min=value)
|
||||||
|
return x0 * (value / s)
|
341
AnyText/AnyText_scripts/ldm/modules/attention.py
Normal file
@ -0,0 +1,341 @@
|
|||||||
|
from inspect import isfunction
|
||||||
|
import math
|
||||||
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from torch import nn, einsum
|
||||||
|
from einops import rearrange, repeat
|
||||||
|
from typing import Optional, Any
|
||||||
|
|
||||||
|
from .diffusionmodules.util import checkpoint
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
import xformers
|
||||||
|
import xformers.ops
|
||||||
|
XFORMERS_IS_AVAILBLE = True
|
||||||
|
except:
|
||||||
|
XFORMERS_IS_AVAILBLE = False
|
||||||
|
|
||||||
|
# CrossAttn precision handling
|
||||||
|
import os
|
||||||
|
_ATTN_PRECISION = os.environ.get("ATTN_PRECISION", "fp32")
|
||||||
|
|
||||||
|
def exists(val):
|
||||||
|
return val is not None
|
||||||
|
|
||||||
|
|
||||||
|
def uniq(arr):
|
||||||
|
return{el: True for el in arr}.keys()
|
||||||
|
|
||||||
|
|
||||||
|
def default(val, d):
|
||||||
|
if exists(val):
|
||||||
|
return val
|
||||||
|
return d() if isfunction(d) else d
|
||||||
|
|
||||||
|
|
||||||
|
def max_neg_value(t):
|
||||||
|
return -torch.finfo(t.dtype).max
|
||||||
|
|
||||||
|
|
||||||
|
def init_(tensor):
|
||||||
|
dim = tensor.shape[-1]
|
||||||
|
std = 1 / math.sqrt(dim)
|
||||||
|
tensor.uniform_(-std, std)
|
||||||
|
return tensor
|
||||||
|
|
||||||
|
|
||||||
|
# feedforward
|
||||||
|
class GEGLU(nn.Module):
|
||||||
|
def __init__(self, dim_in, dim_out):
|
||||||
|
super().__init__()
|
||||||
|
self.proj = nn.Linear(dim_in, dim_out * 2)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
x, gate = self.proj(x).chunk(2, dim=-1)
|
||||||
|
return x * F.gelu(gate)
|
||||||
|
|
||||||
|
|
||||||
|
class FeedForward(nn.Module):
|
||||||
|
def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
|
||||||
|
super().__init__()
|
||||||
|
inner_dim = int(dim * mult)
|
||||||
|
dim_out = default(dim_out, dim)
|
||||||
|
project_in = nn.Sequential(
|
||||||
|
nn.Linear(dim, inner_dim),
|
||||||
|
nn.GELU()
|
||||||
|
) if not glu else GEGLU(dim, inner_dim)
|
||||||
|
|
||||||
|
self.net = nn.Sequential(
|
||||||
|
project_in,
|
||||||
|
nn.Dropout(dropout),
|
||||||
|
nn.Linear(inner_dim, dim_out)
|
||||||
|
)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
return self.net(x)
|
||||||
|
|
||||||
|
|
||||||
|
def zero_module(module):
|
||||||
|
"""
|
||||||
|
Zero out the parameters of a module and return it.
|
||||||
|
"""
|
||||||
|
for p in module.parameters():
|
||||||
|
p.detach().zero_()
|
||||||
|
return module
|
||||||
|
|
||||||
|
|
||||||
|
def Normalize(in_channels):
|
||||||
|
return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
|
||||||
|
|
||||||
|
|
||||||
|
class SpatialSelfAttention(nn.Module):
|
||||||
|
def __init__(self, in_channels):
|
||||||
|
super().__init__()
|
||||||
|
self.in_channels = in_channels
|
||||||
|
|
||||||
|
self.norm = Normalize(in_channels)
|
||||||
|
self.q = torch.nn.Conv2d(in_channels,
|
||||||
|
in_channels,
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0)
|
||||||
|
self.k = torch.nn.Conv2d(in_channels,
|
||||||
|
in_channels,
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0)
|
||||||
|
self.v = torch.nn.Conv2d(in_channels,
|
||||||
|
in_channels,
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0)
|
||||||
|
self.proj_out = torch.nn.Conv2d(in_channels,
|
||||||
|
in_channels,
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
h_ = x
|
||||||
|
h_ = self.norm(h_)
|
||||||
|
q = self.q(h_)
|
||||||
|
k = self.k(h_)
|
||||||
|
v = self.v(h_)
|
||||||
|
|
||||||
|
# compute attention
|
||||||
|
b,c,h,w = q.shape
|
||||||
|
q = rearrange(q, 'b c h w -> b (h w) c')
|
||||||
|
k = rearrange(k, 'b c h w -> b c (h w)')
|
||||||
|
w_ = torch.einsum('bij,bjk->bik', q, k)
|
||||||
|
|
||||||
|
w_ = w_ * (int(c)**(-0.5))
|
||||||
|
w_ = torch.nn.functional.softmax(w_, dim=2)
|
||||||
|
|
||||||
|
# attend to values
|
||||||
|
v = rearrange(v, 'b c h w -> b c (h w)')
|
||||||
|
w_ = rearrange(w_, 'b i j -> b j i')
|
||||||
|
h_ = torch.einsum('bij,bjk->bik', v, w_)
|
||||||
|
h_ = rearrange(h_, 'b c (h w) -> b c h w', h=h)
|
||||||
|
h_ = self.proj_out(h_)
|
||||||
|
|
||||||
|
return x+h_
|
||||||
|
|
||||||
|
|
||||||
|
class CrossAttention(nn.Module):
|
||||||
|
def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.):
|
||||||
|
super().__init__()
|
||||||
|
inner_dim = dim_head * heads
|
||||||
|
context_dim = default(context_dim, query_dim)
|
||||||
|
|
||||||
|
self.scale = dim_head ** -0.5
|
||||||
|
self.heads = heads
|
||||||
|
|
||||||
|
self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
|
||||||
|
self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
|
||||||
|
self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
|
||||||
|
|
||||||
|
self.to_out = nn.Sequential(
|
||||||
|
nn.Linear(inner_dim, query_dim),
|
||||||
|
nn.Dropout(dropout)
|
||||||
|
)
|
||||||
|
|
||||||
|
def forward(self, x, context=None, mask=None):
|
||||||
|
h = self.heads
|
||||||
|
|
||||||
|
q = self.to_q(x)
|
||||||
|
context = default(context, x)
|
||||||
|
k = self.to_k(context)
|
||||||
|
v = self.to_v(context)
|
||||||
|
|
||||||
|
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
|
||||||
|
|
||||||
|
# force cast to fp32 to avoid overflowing
|
||||||
|
if _ATTN_PRECISION =="fp32":
|
||||||
|
with torch.autocast(enabled=False, device_type = 'cuda'):
|
||||||
|
q, k = q.float(), k.float()
|
||||||
|
sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
|
||||||
|
else:
|
||||||
|
sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
|
||||||
|
|
||||||
|
del q, k
|
||||||
|
|
||||||
|
if exists(mask):
|
||||||
|
mask = rearrange(mask, 'b ... -> b (...)')
|
||||||
|
max_neg_value = -torch.finfo(sim.dtype).max
|
||||||
|
mask = repeat(mask, 'b j -> (b h) () j', h=h)
|
||||||
|
sim.masked_fill_(~mask, max_neg_value)
|
||||||
|
|
||||||
|
# attention, what we cannot get enough of
|
||||||
|
sim = sim.softmax(dim=-1)
|
||||||
|
|
||||||
|
out = einsum('b i j, b j d -> b i d', sim, v)
|
||||||
|
out = rearrange(out, '(b h) n d -> b n (h d)', h=h)
|
||||||
|
return self.to_out(out)
|
||||||
|
|
||||||
|
|
||||||
|
class MemoryEfficientCrossAttention(nn.Module):
|
||||||
|
# https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223
|
||||||
|
def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0):
|
||||||
|
super().__init__()
|
||||||
|
print(f"Setting up {self.__class__.__name__}. Query dim is {query_dim}, context_dim is {context_dim} and using "
|
||||||
|
f"{heads} heads.")
|
||||||
|
inner_dim = dim_head * heads
|
||||||
|
context_dim = default(context_dim, query_dim)
|
||||||
|
|
||||||
|
self.heads = heads
|
||||||
|
self.dim_head = dim_head
|
||||||
|
|
||||||
|
self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
|
||||||
|
self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
|
||||||
|
self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
|
||||||
|
|
||||||
|
self.to_out = nn.Sequential(nn.Linear(inner_dim, query_dim), nn.Dropout(dropout))
|
||||||
|
self.attention_op: Optional[Any] = None
|
||||||
|
|
||||||
|
def forward(self, x, context=None, mask=None):
|
||||||
|
q = self.to_q(x)
|
||||||
|
context = default(context, x)
|
||||||
|
k = self.to_k(context)
|
||||||
|
v = self.to_v(context)
|
||||||
|
|
||||||
|
b, _, _ = q.shape
|
||||||
|
q, k, v = map(
|
||||||
|
lambda t: t.unsqueeze(3)
|
||||||
|
.reshape(b, t.shape[1], self.heads, self.dim_head)
|
||||||
|
.permute(0, 2, 1, 3)
|
||||||
|
.reshape(b * self.heads, t.shape[1], self.dim_head)
|
||||||
|
.contiguous(),
|
||||||
|
(q, k, v),
|
||||||
|
)
|
||||||
|
|
||||||
|
# actually compute the attention, what we cannot get enough of
|
||||||
|
out = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=None, op=self.attention_op)
|
||||||
|
|
||||||
|
if exists(mask):
|
||||||
|
raise NotImplementedError
|
||||||
|
out = (
|
||||||
|
out.unsqueeze(0)
|
||||||
|
.reshape(b, self.heads, out.shape[1], self.dim_head)
|
||||||
|
.permute(0, 2, 1, 3)
|
||||||
|
.reshape(b, out.shape[1], self.heads * self.dim_head)
|
||||||
|
)
|
||||||
|
return self.to_out(out)
|
||||||
|
|
||||||
|
|
||||||
|
class BasicTransformerBlock(nn.Module):
|
||||||
|
ATTENTION_MODES = {
|
||||||
|
"softmax": CrossAttention, # vanilla attention
|
||||||
|
"softmax-xformers": MemoryEfficientCrossAttention
|
||||||
|
}
|
||||||
|
def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, checkpoint=True,
|
||||||
|
disable_self_attn=False):
|
||||||
|
super().__init__()
|
||||||
|
attn_mode = "softmax-xformers" if XFORMERS_IS_AVAILBLE else "softmax"
|
||||||
|
assert attn_mode in self.ATTENTION_MODES
|
||||||
|
attn_cls = self.ATTENTION_MODES[attn_mode]
|
||||||
|
self.disable_self_attn = disable_self_attn
|
||||||
|
self.attn1 = attn_cls(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout,
|
||||||
|
context_dim=context_dim if self.disable_self_attn else None) # is a self-attention if not self.disable_self_attn
|
||||||
|
self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
|
||||||
|
self.attn2 = attn_cls(query_dim=dim, context_dim=context_dim,
|
||||||
|
heads=n_heads, dim_head=d_head, dropout=dropout) # is self-attn if context is none
|
||||||
|
self.norm1 = nn.LayerNorm(dim)
|
||||||
|
self.norm2 = nn.LayerNorm(dim)
|
||||||
|
self.norm3 = nn.LayerNorm(dim)
|
||||||
|
self.checkpoint = checkpoint
|
||||||
|
|
||||||
|
def forward(self, x, context=None):
|
||||||
|
return checkpoint(self._forward, (x, context), self.parameters(), self.checkpoint)
|
||||||
|
|
||||||
|
def _forward(self, x, context=None):
|
||||||
|
x = self.attn1(self.norm1(x), context=context if self.disable_self_attn else None) + x
|
||||||
|
x = self.attn2(self.norm2(x), context=context) + x
|
||||||
|
x = self.ff(self.norm3(x)) + x
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class SpatialTransformer(nn.Module):
|
||||||
|
"""
|
||||||
|
Transformer block for image-like data.
|
||||||
|
First, project the input (aka embedding)
|
||||||
|
and reshape to b, t, d.
|
||||||
|
Then apply standard transformer action.
|
||||||
|
Finally, reshape to image
|
||||||
|
NEW: use_linear for more efficiency instead of the 1x1 convs
|
||||||
|
"""
|
||||||
|
def __init__(self, in_channels, n_heads, d_head,
|
||||||
|
depth=1, dropout=0., context_dim=None,
|
||||||
|
disable_self_attn=False, use_linear=False,
|
||||||
|
use_checkpoint=True):
|
||||||
|
super().__init__()
|
||||||
|
if exists(context_dim) and not isinstance(context_dim, list):
|
||||||
|
context_dim = [context_dim]
|
||||||
|
self.in_channels = in_channels
|
||||||
|
inner_dim = n_heads * d_head
|
||||||
|
self.norm = Normalize(in_channels)
|
||||||
|
if not use_linear:
|
||||||
|
self.proj_in = nn.Conv2d(in_channels,
|
||||||
|
inner_dim,
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0)
|
||||||
|
else:
|
||||||
|
self.proj_in = nn.Linear(in_channels, inner_dim)
|
||||||
|
|
||||||
|
self.transformer_blocks = nn.ModuleList(
|
||||||
|
[BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim[d],
|
||||||
|
disable_self_attn=disable_self_attn, checkpoint=use_checkpoint)
|
||||||
|
for d in range(depth)]
|
||||||
|
)
|
||||||
|
if not use_linear:
|
||||||
|
self.proj_out = zero_module(nn.Conv2d(inner_dim,
|
||||||
|
in_channels,
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0))
|
||||||
|
else:
|
||||||
|
self.proj_out = zero_module(nn.Linear(in_channels, inner_dim))
|
||||||
|
self.use_linear = use_linear
|
||||||
|
|
||||||
|
def forward(self, x, context=None):
|
||||||
|
# note: if no context is given, cross-attention defaults to self-attention
|
||||||
|
if not isinstance(context, list):
|
||||||
|
context = [context]
|
||||||
|
b, c, h, w = x.shape
|
||||||
|
x_in = x
|
||||||
|
x = self.norm(x)
|
||||||
|
if not self.use_linear:
|
||||||
|
x = self.proj_in(x)
|
||||||
|
x = rearrange(x, 'b c h w -> b (h w) c').contiguous()
|
||||||
|
if self.use_linear:
|
||||||
|
x = self.proj_in(x)
|
||||||
|
for i, block in enumerate(self.transformer_blocks):
|
||||||
|
x = block(x, context=context[i])
|
||||||
|
if self.use_linear:
|
||||||
|
x = self.proj_out(x)
|
||||||
|
x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w).contiguous()
|
||||||
|
if not self.use_linear:
|
||||||
|
x = self.proj_out(x)
|
||||||
|
return x + x_in
|
||||||
|
|
852
AnyText/AnyText_scripts/ldm/modules/diffusionmodules/model.py
Normal file
@ -0,0 +1,852 @@
|
|||||||
|
# pytorch_diffusion + derived encoder decoder
|
||||||
|
import math
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import numpy as np
|
||||||
|
from einops import rearrange
|
||||||
|
from typing import Optional, Any
|
||||||
|
|
||||||
|
from ..attention import MemoryEfficientCrossAttention
|
||||||
|
|
||||||
|
try:
|
||||||
|
import xformers
|
||||||
|
import xformers.ops
|
||||||
|
XFORMERS_IS_AVAILBLE = True
|
||||||
|
except:
|
||||||
|
XFORMERS_IS_AVAILBLE = False
|
||||||
|
print("No module 'xformers'. Proceeding without it.")
|
||||||
|
|
||||||
|
|
||||||
|
def get_timestep_embedding(timesteps, embedding_dim):
|
||||||
|
"""
|
||||||
|
This matches the implementation in Denoising Diffusion Probabilistic Models:
|
||||||
|
From Fairseq.
|
||||||
|
Build sinusoidal embeddings.
|
||||||
|
This matches the implementation in tensor2tensor, but differs slightly
|
||||||
|
from the description in Section 3.5 of "Attention Is All You Need".
|
||||||
|
"""
|
||||||
|
assert len(timesteps.shape) == 1
|
||||||
|
|
||||||
|
half_dim = embedding_dim // 2
|
||||||
|
emb = math.log(10000) / (half_dim - 1)
|
||||||
|
emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb)
|
||||||
|
emb = emb.to(device=timesteps.device)
|
||||||
|
emb = timesteps.float()[:, None] * emb[None, :]
|
||||||
|
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
|
||||||
|
if embedding_dim % 2 == 1: # zero pad
|
||||||
|
emb = torch.nn.functional.pad(emb, (0,1,0,0))
|
||||||
|
return emb
|
||||||
|
|
||||||
|
|
||||||
|
def nonlinearity(x):
|
||||||
|
# swish
|
||||||
|
return x*torch.sigmoid(x)
|
||||||
|
|
||||||
|
|
||||||
|
def Normalize(in_channels, num_groups=32):
|
||||||
|
return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)
|
||||||
|
|
||||||
|
|
||||||
|
class Upsample(nn.Module):
|
||||||
|
def __init__(self, in_channels, with_conv):
|
||||||
|
super().__init__()
|
||||||
|
self.with_conv = with_conv
|
||||||
|
if self.with_conv:
|
||||||
|
self.conv = torch.nn.Conv2d(in_channels,
|
||||||
|
in_channels,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=1,
|
||||||
|
padding=1)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
|
||||||
|
if self.with_conv:
|
||||||
|
x = self.conv(x)
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class Downsample(nn.Module):
|
||||||
|
def __init__(self, in_channels, with_conv):
|
||||||
|
super().__init__()
|
||||||
|
self.with_conv = with_conv
|
||||||
|
if self.with_conv:
|
||||||
|
# no asymmetric padding in torch conv, must do it ourselves
|
||||||
|
self.conv = torch.nn.Conv2d(in_channels,
|
||||||
|
in_channels,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=2,
|
||||||
|
padding=0)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
if self.with_conv:
|
||||||
|
pad = (0,1,0,1)
|
||||||
|
x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
|
||||||
|
x = self.conv(x)
|
||||||
|
else:
|
||||||
|
x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class ResnetBlock(nn.Module):
|
||||||
|
def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
|
||||||
|
dropout, temb_channels=512):
|
||||||
|
super().__init__()
|
||||||
|
self.in_channels = in_channels
|
||||||
|
out_channels = in_channels if out_channels is None else out_channels
|
||||||
|
self.out_channels = out_channels
|
||||||
|
self.use_conv_shortcut = conv_shortcut
|
||||||
|
|
||||||
|
self.norm1 = Normalize(in_channels)
|
||||||
|
self.conv1 = torch.nn.Conv2d(in_channels,
|
||||||
|
out_channels,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=1,
|
||||||
|
padding=1)
|
||||||
|
if temb_channels > 0:
|
||||||
|
self.temb_proj = torch.nn.Linear(temb_channels,
|
||||||
|
out_channels)
|
||||||
|
self.norm2 = Normalize(out_channels)
|
||||||
|
self.dropout = torch.nn.Dropout(dropout)
|
||||||
|
self.conv2 = torch.nn.Conv2d(out_channels,
|
||||||
|
out_channels,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=1,
|
||||||
|
padding=1)
|
||||||
|
if self.in_channels != self.out_channels:
|
||||||
|
if self.use_conv_shortcut:
|
||||||
|
self.conv_shortcut = torch.nn.Conv2d(in_channels,
|
||||||
|
out_channels,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=1,
|
||||||
|
padding=1)
|
||||||
|
else:
|
||||||
|
self.nin_shortcut = torch.nn.Conv2d(in_channels,
|
||||||
|
out_channels,
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0)
|
||||||
|
|
||||||
|
def forward(self, x, temb):
|
||||||
|
h = x
|
||||||
|
h = self.norm1(h)
|
||||||
|
h = nonlinearity(h)
|
||||||
|
h = self.conv1(h)
|
||||||
|
|
||||||
|
if temb is not None:
|
||||||
|
h = h + self.temb_proj(nonlinearity(temb))[:,:,None,None]
|
||||||
|
|
||||||
|
h = self.norm2(h)
|
||||||
|
h = nonlinearity(h)
|
||||||
|
h = self.dropout(h)
|
||||||
|
h = self.conv2(h)
|
||||||
|
|
||||||
|
if self.in_channels != self.out_channels:
|
||||||
|
if self.use_conv_shortcut:
|
||||||
|
x = self.conv_shortcut(x)
|
||||||
|
else:
|
||||||
|
x = self.nin_shortcut(x)
|
||||||
|
|
||||||
|
return x+h
|
||||||
|
|
||||||
|
|
||||||
|
class AttnBlock(nn.Module):
|
||||||
|
def __init__(self, in_channels):
|
||||||
|
super().__init__()
|
||||||
|
self.in_channels = in_channels
|
||||||
|
|
||||||
|
self.norm = Normalize(in_channels)
|
||||||
|
self.q = torch.nn.Conv2d(in_channels,
|
||||||
|
in_channels,
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0)
|
||||||
|
self.k = torch.nn.Conv2d(in_channels,
|
||||||
|
in_channels,
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0)
|
||||||
|
self.v = torch.nn.Conv2d(in_channels,
|
||||||
|
in_channels,
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0)
|
||||||
|
self.proj_out = torch.nn.Conv2d(in_channels,
|
||||||
|
in_channels,
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
h_ = x
|
||||||
|
h_ = self.norm(h_)
|
||||||
|
q = self.q(h_)
|
||||||
|
k = self.k(h_)
|
||||||
|
v = self.v(h_)
|
||||||
|
|
||||||
|
# compute attention
|
||||||
|
b,c,h,w = q.shape
|
||||||
|
q = q.reshape(b,c,h*w)
|
||||||
|
q = q.permute(0,2,1) # b,hw,c
|
||||||
|
k = k.reshape(b,c,h*w) # b,c,hw
|
||||||
|
w_ = torch.bmm(q,k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
|
||||||
|
w_ = w_ * (int(c)**(-0.5))
|
||||||
|
w_ = torch.nn.functional.softmax(w_, dim=2)
|
||||||
|
|
||||||
|
# attend to values
|
||||||
|
v = v.reshape(b,c,h*w)
|
||||||
|
w_ = w_.permute(0,2,1) # b,hw,hw (first hw of k, second of q)
|
||||||
|
h_ = torch.bmm(v,w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
|
||||||
|
h_ = h_.reshape(b,c,h,w)
|
||||||
|
|
||||||
|
h_ = self.proj_out(h_)
|
||||||
|
|
||||||
|
return x+h_
|
||||||
|
|
||||||
|
class MemoryEfficientAttnBlock(nn.Module):
|
||||||
|
"""
|
||||||
|
Uses xformers efficient implementation,
|
||||||
|
see https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223
|
||||||
|
Note: this is a single-head self-attention operation
|
||||||
|
"""
|
||||||
|
#
|
||||||
|
def __init__(self, in_channels):
|
||||||
|
super().__init__()
|
||||||
|
self.in_channels = in_channels
|
||||||
|
|
||||||
|
self.norm = Normalize(in_channels)
|
||||||
|
self.q = torch.nn.Conv2d(in_channels,
|
||||||
|
in_channels,
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0)
|
||||||
|
self.k = torch.nn.Conv2d(in_channels,
|
||||||
|
in_channels,
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0)
|
||||||
|
self.v = torch.nn.Conv2d(in_channels,
|
||||||
|
in_channels,
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0)
|
||||||
|
self.proj_out = torch.nn.Conv2d(in_channels,
|
||||||
|
in_channels,
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0)
|
||||||
|
self.attention_op: Optional[Any] = None
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
h_ = x
|
||||||
|
h_ = self.norm(h_)
|
||||||
|
q = self.q(h_)
|
||||||
|
k = self.k(h_)
|
||||||
|
v = self.v(h_)
|
||||||
|
|
||||||
|
# compute attention
|
||||||
|
B, C, H, W = q.shape
|
||||||
|
q, k, v = map(lambda x: rearrange(x, 'b c h w -> b (h w) c'), (q, k, v))
|
||||||
|
|
||||||
|
q, k, v = map(
|
||||||
|
lambda t: t.unsqueeze(3)
|
||||||
|
.reshape(B, t.shape[1], 1, C)
|
||||||
|
.permute(0, 2, 1, 3)
|
||||||
|
.reshape(B * 1, t.shape[1], C)
|
||||||
|
.contiguous(),
|
||||||
|
(q, k, v),
|
||||||
|
)
|
||||||
|
out = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=None, op=self.attention_op)
|
||||||
|
|
||||||
|
out = (
|
||||||
|
out.unsqueeze(0)
|
||||||
|
.reshape(B, 1, out.shape[1], C)
|
||||||
|
.permute(0, 2, 1, 3)
|
||||||
|
.reshape(B, out.shape[1], C)
|
||||||
|
)
|
||||||
|
out = rearrange(out, 'b (h w) c -> b c h w', b=B, h=H, w=W, c=C)
|
||||||
|
out = self.proj_out(out)
|
||||||
|
return x+out
|
||||||
|
|
||||||
|
|
||||||
|
class MemoryEfficientCrossAttentionWrapper(MemoryEfficientCrossAttention):
|
||||||
|
def forward(self, x, context=None, mask=None):
|
||||||
|
b, c, h, w = x.shape
|
||||||
|
x = rearrange(x, 'b c h w -> b (h w) c')
|
||||||
|
out = super().forward(x, context=context, mask=mask)
|
||||||
|
out = rearrange(out, 'b (h w) c -> b c h w', h=h, w=w, c=c)
|
||||||
|
return x + out
|
||||||
|
|
||||||
|
|
||||||
|
def make_attn(in_channels, attn_type="vanilla", attn_kwargs=None):
|
||||||
|
assert attn_type in ["vanilla", "vanilla-xformers", "memory-efficient-cross-attn", "linear", "none"], f'attn_type {attn_type} unknown'
|
||||||
|
if XFORMERS_IS_AVAILBLE and attn_type == "vanilla":
|
||||||
|
attn_type = "vanilla-xformers"
|
||||||
|
print(f"making attention of type '{attn_type}' with {in_channels} in_channels")
|
||||||
|
if attn_type == "vanilla":
|
||||||
|
assert attn_kwargs is None
|
||||||
|
return AttnBlock(in_channels)
|
||||||
|
elif attn_type == "vanilla-xformers":
|
||||||
|
print(f"building MemoryEfficientAttnBlock with {in_channels} in_channels...")
|
||||||
|
return MemoryEfficientAttnBlock(in_channels)
|
||||||
|
elif type == "memory-efficient-cross-attn":
|
||||||
|
attn_kwargs["query_dim"] = in_channels
|
||||||
|
return MemoryEfficientCrossAttentionWrapper(**attn_kwargs)
|
||||||
|
elif attn_type == "none":
|
||||||
|
return nn.Identity(in_channels)
|
||||||
|
else:
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
|
||||||
|
class Model(nn.Module):
|
||||||
|
def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
|
||||||
|
attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
|
||||||
|
resolution, use_timestep=True, use_linear_attn=False, attn_type="vanilla"):
|
||||||
|
super().__init__()
|
||||||
|
if use_linear_attn: attn_type = "linear"
|
||||||
|
self.ch = ch
|
||||||
|
self.temb_ch = self.ch*4
|
||||||
|
self.num_resolutions = len(ch_mult)
|
||||||
|
self.num_res_blocks = num_res_blocks
|
||||||
|
self.resolution = resolution
|
||||||
|
self.in_channels = in_channels
|
||||||
|
|
||||||
|
self.use_timestep = use_timestep
|
||||||
|
if self.use_timestep:
|
||||||
|
# timestep embedding
|
||||||
|
self.temb = nn.Module()
|
||||||
|
self.temb.dense = nn.ModuleList([
|
||||||
|
torch.nn.Linear(self.ch,
|
||||||
|
self.temb_ch),
|
||||||
|
torch.nn.Linear(self.temb_ch,
|
||||||
|
self.temb_ch),
|
||||||
|
])
|
||||||
|
|
||||||
|
# downsampling
|
||||||
|
self.conv_in = torch.nn.Conv2d(in_channels,
|
||||||
|
self.ch,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=1,
|
||||||
|
padding=1)
|
||||||
|
|
||||||
|
curr_res = resolution
|
||||||
|
in_ch_mult = (1,)+tuple(ch_mult)
|
||||||
|
self.down = nn.ModuleList()
|
||||||
|
for i_level in range(self.num_resolutions):
|
||||||
|
block = nn.ModuleList()
|
||||||
|
attn = nn.ModuleList()
|
||||||
|
block_in = ch*in_ch_mult[i_level]
|
||||||
|
block_out = ch*ch_mult[i_level]
|
||||||
|
for i_block in range(self.num_res_blocks):
|
||||||
|
block.append(ResnetBlock(in_channels=block_in,
|
||||||
|
out_channels=block_out,
|
||||||
|
temb_channels=self.temb_ch,
|
||||||
|
dropout=dropout))
|
||||||
|
block_in = block_out
|
||||||
|
if curr_res in attn_resolutions:
|
||||||
|
attn.append(make_attn(block_in, attn_type=attn_type))
|
||||||
|
down = nn.Module()
|
||||||
|
down.block = block
|
||||||
|
down.attn = attn
|
||||||
|
if i_level != self.num_resolutions-1:
|
||||||
|
down.downsample = Downsample(block_in, resamp_with_conv)
|
||||||
|
curr_res = curr_res // 2
|
||||||
|
self.down.append(down)
|
||||||
|
|
||||||
|
# middle
|
||||||
|
self.mid = nn.Module()
|
||||||
|
self.mid.block_1 = ResnetBlock(in_channels=block_in,
|
||||||
|
out_channels=block_in,
|
||||||
|
temb_channels=self.temb_ch,
|
||||||
|
dropout=dropout)
|
||||||
|
self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
|
||||||
|
self.mid.block_2 = ResnetBlock(in_channels=block_in,
|
||||||
|
out_channels=block_in,
|
||||||
|
temb_channels=self.temb_ch,
|
||||||
|
dropout=dropout)
|
||||||
|
|
||||||
|
# upsampling
|
||||||
|
self.up = nn.ModuleList()
|
||||||
|
for i_level in reversed(range(self.num_resolutions)):
|
||||||
|
block = nn.ModuleList()
|
||||||
|
attn = nn.ModuleList()
|
||||||
|
block_out = ch*ch_mult[i_level]
|
||||||
|
skip_in = ch*ch_mult[i_level]
|
||||||
|
for i_block in range(self.num_res_blocks+1):
|
||||||
|
if i_block == self.num_res_blocks:
|
||||||
|
skip_in = ch*in_ch_mult[i_level]
|
||||||
|
block.append(ResnetBlock(in_channels=block_in+skip_in,
|
||||||
|
out_channels=block_out,
|
||||||
|
temb_channels=self.temb_ch,
|
||||||
|
dropout=dropout))
|
||||||
|
block_in = block_out
|
||||||
|
if curr_res in attn_resolutions:
|
||||||
|
attn.append(make_attn(block_in, attn_type=attn_type))
|
||||||
|
up = nn.Module()
|
||||||
|
up.block = block
|
||||||
|
up.attn = attn
|
||||||
|
if i_level != 0:
|
||||||
|
up.upsample = Upsample(block_in, resamp_with_conv)
|
||||||
|
curr_res = curr_res * 2
|
||||||
|
self.up.insert(0, up) # prepend to get consistent order
|
||||||
|
|
||||||
|
# end
|
||||||
|
self.norm_out = Normalize(block_in)
|
||||||
|
self.conv_out = torch.nn.Conv2d(block_in,
|
||||||
|
out_ch,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=1,
|
||||||
|
padding=1)
|
||||||
|
|
||||||
|
def forward(self, x, t=None, context=None):
|
||||||
|
#assert x.shape[2] == x.shape[3] == self.resolution
|
||||||
|
if context is not None:
|
||||||
|
# assume aligned context, cat along channel axis
|
||||||
|
x = torch.cat((x, context), dim=1)
|
||||||
|
if self.use_timestep:
|
||||||
|
# timestep embedding
|
||||||
|
assert t is not None
|
||||||
|
temb = get_timestep_embedding(t, self.ch)
|
||||||
|
temb = self.temb.dense[0](temb)
|
||||||
|
temb = nonlinearity(temb)
|
||||||
|
temb = self.temb.dense[1](temb)
|
||||||
|
else:
|
||||||
|
temb = None
|
||||||
|
|
||||||
|
# downsampling
|
||||||
|
hs = [self.conv_in(x)]
|
||||||
|
for i_level in range(self.num_resolutions):
|
||||||
|
for i_block in range(self.num_res_blocks):
|
||||||
|
h = self.down[i_level].block[i_block](hs[-1], temb)
|
||||||
|
if len(self.down[i_level].attn) > 0:
|
||||||
|
h = self.down[i_level].attn[i_block](h)
|
||||||
|
hs.append(h)
|
||||||
|
if i_level != self.num_resolutions-1:
|
||||||
|
hs.append(self.down[i_level].downsample(hs[-1]))
|
||||||
|
|
||||||
|
# middle
|
||||||
|
h = hs[-1]
|
||||||
|
h = self.mid.block_1(h, temb)
|
||||||
|
h = self.mid.attn_1(h)
|
||||||
|
h = self.mid.block_2(h, temb)
|
||||||
|
|
||||||
|
# upsampling
|
||||||
|
for i_level in reversed(range(self.num_resolutions)):
|
||||||
|
for i_block in range(self.num_res_blocks+1):
|
||||||
|
h = self.up[i_level].block[i_block](
|
||||||
|
torch.cat([h, hs.pop()], dim=1), temb)
|
||||||
|
if len(self.up[i_level].attn) > 0:
|
||||||
|
h = self.up[i_level].attn[i_block](h)
|
||||||
|
if i_level != 0:
|
||||||
|
h = self.up[i_level].upsample(h)
|
||||||
|
|
||||||
|
# end
|
||||||
|
h = self.norm_out(h)
|
||||||
|
h = nonlinearity(h)
|
||||||
|
h = self.conv_out(h)
|
||||||
|
return h
|
||||||
|
|
||||||
|
def get_last_layer(self):
|
||||||
|
return self.conv_out.weight
|
||||||
|
|
||||||
|
|
||||||
|
class Encoder(nn.Module):
|
||||||
|
def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
|
||||||
|
attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
|
||||||
|
resolution, z_channels, double_z=True, use_linear_attn=False, attn_type="vanilla",
|
||||||
|
**ignore_kwargs):
|
||||||
|
super().__init__()
|
||||||
|
if use_linear_attn: attn_type = "linear"
|
||||||
|
self.ch = ch
|
||||||
|
self.temb_ch = 0
|
||||||
|
self.num_resolutions = len(ch_mult)
|
||||||
|
self.num_res_blocks = num_res_blocks
|
||||||
|
self.resolution = resolution
|
||||||
|
self.in_channels = in_channels
|
||||||
|
|
||||||
|
# downsampling
|
||||||
|
self.conv_in = torch.nn.Conv2d(in_channels,
|
||||||
|
self.ch,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=1,
|
||||||
|
padding=1)
|
||||||
|
|
||||||
|
curr_res = resolution
|
||||||
|
in_ch_mult = (1,)+tuple(ch_mult)
|
||||||
|
self.in_ch_mult = in_ch_mult
|
||||||
|
self.down = nn.ModuleList()
|
||||||
|
for i_level in range(self.num_resolutions):
|
||||||
|
block = nn.ModuleList()
|
||||||
|
attn = nn.ModuleList()
|
||||||
|
block_in = ch*in_ch_mult[i_level]
|
||||||
|
block_out = ch*ch_mult[i_level]
|
||||||
|
for i_block in range(self.num_res_blocks):
|
||||||
|
block.append(ResnetBlock(in_channels=block_in,
|
||||||
|
out_channels=block_out,
|
||||||
|
temb_channels=self.temb_ch,
|
||||||
|
dropout=dropout))
|
||||||
|
block_in = block_out
|
||||||
|
if curr_res in attn_resolutions:
|
||||||
|
attn.append(make_attn(block_in, attn_type=attn_type))
|
||||||
|
down = nn.Module()
|
||||||
|
down.block = block
|
||||||
|
down.attn = attn
|
||||||
|
if i_level != self.num_resolutions-1:
|
||||||
|
down.downsample = Downsample(block_in, resamp_with_conv)
|
||||||
|
curr_res = curr_res // 2
|
||||||
|
self.down.append(down)
|
||||||
|
|
||||||
|
# middle
|
||||||
|
self.mid = nn.Module()
|
||||||
|
self.mid.block_1 = ResnetBlock(in_channels=block_in,
|
||||||
|
out_channels=block_in,
|
||||||
|
temb_channels=self.temb_ch,
|
||||||
|
dropout=dropout)
|
||||||
|
self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
|
||||||
|
self.mid.block_2 = ResnetBlock(in_channels=block_in,
|
||||||
|
out_channels=block_in,
|
||||||
|
temb_channels=self.temb_ch,
|
||||||
|
dropout=dropout)
|
||||||
|
|
||||||
|
# end
|
||||||
|
self.norm_out = Normalize(block_in)
|
||||||
|
self.conv_out = torch.nn.Conv2d(block_in,
|
||||||
|
2*z_channels if double_z else z_channels,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=1,
|
||||||
|
padding=1)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
# timestep embedding
|
||||||
|
temb = None
|
||||||
|
|
||||||
|
# downsampling
|
||||||
|
hs = [self.conv_in(x)]
|
||||||
|
for i_level in range(self.num_resolutions):
|
||||||
|
for i_block in range(self.num_res_blocks):
|
||||||
|
h = self.down[i_level].block[i_block](hs[-1], temb)
|
||||||
|
if len(self.down[i_level].attn) > 0:
|
||||||
|
h = self.down[i_level].attn[i_block](h)
|
||||||
|
hs.append(h)
|
||||||
|
if i_level != self.num_resolutions-1:
|
||||||
|
hs.append(self.down[i_level].downsample(hs[-1]))
|
||||||
|
|
||||||
|
# middle
|
||||||
|
h = hs[-1]
|
||||||
|
h = self.mid.block_1(h, temb)
|
||||||
|
h = self.mid.attn_1(h)
|
||||||
|
h = self.mid.block_2(h, temb)
|
||||||
|
|
||||||
|
# end
|
||||||
|
h = self.norm_out(h)
|
||||||
|
h = nonlinearity(h)
|
||||||
|
h = self.conv_out(h)
|
||||||
|
return h
|
||||||
|
|
||||||
|
|
||||||
|
class Decoder(nn.Module):
|
||||||
|
def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
|
||||||
|
attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
|
||||||
|
resolution, z_channels, give_pre_end=False, tanh_out=False, use_linear_attn=False,
|
||||||
|
attn_type="vanilla", **ignorekwargs):
|
||||||
|
super().__init__()
|
||||||
|
if use_linear_attn: attn_type = "linear"
|
||||||
|
self.ch = ch
|
||||||
|
self.temb_ch = 0
|
||||||
|
self.num_resolutions = len(ch_mult)
|
||||||
|
self.num_res_blocks = num_res_blocks
|
||||||
|
self.resolution = resolution
|
||||||
|
self.in_channels = in_channels
|
||||||
|
self.give_pre_end = give_pre_end
|
||||||
|
self.tanh_out = tanh_out
|
||||||
|
|
||||||
|
# compute in_ch_mult, block_in and curr_res at lowest res
|
||||||
|
in_ch_mult = (1,)+tuple(ch_mult)
|
||||||
|
block_in = ch*ch_mult[self.num_resolutions-1]
|
||||||
|
curr_res = resolution // 2**(self.num_resolutions-1)
|
||||||
|
self.z_shape = (1,z_channels,curr_res,curr_res)
|
||||||
|
print("Working with z of shape {} = {} dimensions.".format(
|
||||||
|
self.z_shape, np.prod(self.z_shape)))
|
||||||
|
|
||||||
|
# z to block_in
|
||||||
|
self.conv_in = torch.nn.Conv2d(z_channels,
|
||||||
|
block_in,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=1,
|
||||||
|
padding=1)
|
||||||
|
|
||||||
|
# middle
|
||||||
|
self.mid = nn.Module()
|
||||||
|
self.mid.block_1 = ResnetBlock(in_channels=block_in,
|
||||||
|
out_channels=block_in,
|
||||||
|
temb_channels=self.temb_ch,
|
||||||
|
dropout=dropout)
|
||||||
|
self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
|
||||||
|
self.mid.block_2 = ResnetBlock(in_channels=block_in,
|
||||||
|
out_channels=block_in,
|
||||||
|
temb_channels=self.temb_ch,
|
||||||
|
dropout=dropout)
|
||||||
|
|
||||||
|
# upsampling
|
||||||
|
self.up = nn.ModuleList()
|
||||||
|
for i_level in reversed(range(self.num_resolutions)):
|
||||||
|
block = nn.ModuleList()
|
||||||
|
attn = nn.ModuleList()
|
||||||
|
block_out = ch*ch_mult[i_level]
|
||||||
|
for i_block in range(self.num_res_blocks+1):
|
||||||
|
block.append(ResnetBlock(in_channels=block_in,
|
||||||
|
out_channels=block_out,
|
||||||
|
temb_channels=self.temb_ch,
|
||||||
|
dropout=dropout))
|
||||||
|
block_in = block_out
|
||||||
|
if curr_res in attn_resolutions:
|
||||||
|
attn.append(make_attn(block_in, attn_type=attn_type))
|
||||||
|
up = nn.Module()
|
||||||
|
up.block = block
|
||||||
|
up.attn = attn
|
||||||
|
if i_level != 0:
|
||||||
|
up.upsample = Upsample(block_in, resamp_with_conv)
|
||||||
|
curr_res = curr_res * 2
|
||||||
|
self.up.insert(0, up) # prepend to get consistent order
|
||||||
|
|
||||||
|
# end
|
||||||
|
self.norm_out = Normalize(block_in)
|
||||||
|
self.conv_out = torch.nn.Conv2d(block_in,
|
||||||
|
out_ch,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=1,
|
||||||
|
padding=1)
|
||||||
|
|
||||||
|
def forward(self, z):
|
||||||
|
#assert z.shape[1:] == self.z_shape[1:]
|
||||||
|
self.last_z_shape = z.shape
|
||||||
|
|
||||||
|
# timestep embedding
|
||||||
|
temb = None
|
||||||
|
|
||||||
|
# z to block_in
|
||||||
|
h = self.conv_in(z)
|
||||||
|
|
||||||
|
# middle
|
||||||
|
h = self.mid.block_1(h, temb)
|
||||||
|
h = self.mid.attn_1(h)
|
||||||
|
h = self.mid.block_2(h, temb)
|
||||||
|
|
||||||
|
# upsampling
|
||||||
|
for i_level in reversed(range(self.num_resolutions)):
|
||||||
|
for i_block in range(self.num_res_blocks+1):
|
||||||
|
h = self.up[i_level].block[i_block](h, temb)
|
||||||
|
if len(self.up[i_level].attn) > 0:
|
||||||
|
h = self.up[i_level].attn[i_block](h)
|
||||||
|
if i_level != 0:
|
||||||
|
h = self.up[i_level].upsample(h)
|
||||||
|
|
||||||
|
# end
|
||||||
|
if self.give_pre_end:
|
||||||
|
return h
|
||||||
|
|
||||||
|
h = self.norm_out(h)
|
||||||
|
h = nonlinearity(h)
|
||||||
|
h = self.conv_out(h)
|
||||||
|
if self.tanh_out:
|
||||||
|
h = torch.tanh(h)
|
||||||
|
return h
|
||||||
|
|
||||||
|
|
||||||
|
class SimpleDecoder(nn.Module):
|
||||||
|
def __init__(self, in_channels, out_channels, *args, **kwargs):
|
||||||
|
super().__init__()
|
||||||
|
self.model = nn.ModuleList([nn.Conv2d(in_channels, in_channels, 1),
|
||||||
|
ResnetBlock(in_channels=in_channels,
|
||||||
|
out_channels=2 * in_channels,
|
||||||
|
temb_channels=0, dropout=0.0),
|
||||||
|
ResnetBlock(in_channels=2 * in_channels,
|
||||||
|
out_channels=4 * in_channels,
|
||||||
|
temb_channels=0, dropout=0.0),
|
||||||
|
ResnetBlock(in_channels=4 * in_channels,
|
||||||
|
out_channels=2 * in_channels,
|
||||||
|
temb_channels=0, dropout=0.0),
|
||||||
|
nn.Conv2d(2*in_channels, in_channels, 1),
|
||||||
|
Upsample(in_channels, with_conv=True)])
|
||||||
|
# end
|
||||||
|
self.norm_out = Normalize(in_channels)
|
||||||
|
self.conv_out = torch.nn.Conv2d(in_channels,
|
||||||
|
out_channels,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=1,
|
||||||
|
padding=1)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
for i, layer in enumerate(self.model):
|
||||||
|
if i in [1,2,3]:
|
||||||
|
x = layer(x, None)
|
||||||
|
else:
|
||||||
|
x = layer(x)
|
||||||
|
|
||||||
|
h = self.norm_out(x)
|
||||||
|
h = nonlinearity(h)
|
||||||
|
x = self.conv_out(h)
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class UpsampleDecoder(nn.Module):
|
||||||
|
def __init__(self, in_channels, out_channels, ch, num_res_blocks, resolution,
|
||||||
|
ch_mult=(2,2), dropout=0.0):
|
||||||
|
super().__init__()
|
||||||
|
# upsampling
|
||||||
|
self.temb_ch = 0
|
||||||
|
self.num_resolutions = len(ch_mult)
|
||||||
|
self.num_res_blocks = num_res_blocks
|
||||||
|
block_in = in_channels
|
||||||
|
curr_res = resolution // 2 ** (self.num_resolutions - 1)
|
||||||
|
self.res_blocks = nn.ModuleList()
|
||||||
|
self.upsample_blocks = nn.ModuleList()
|
||||||
|
for i_level in range(self.num_resolutions):
|
||||||
|
res_block = []
|
||||||
|
block_out = ch * ch_mult[i_level]
|
||||||
|
for i_block in range(self.num_res_blocks + 1):
|
||||||
|
res_block.append(ResnetBlock(in_channels=block_in,
|
||||||
|
out_channels=block_out,
|
||||||
|
temb_channels=self.temb_ch,
|
||||||
|
dropout=dropout))
|
||||||
|
block_in = block_out
|
||||||
|
self.res_blocks.append(nn.ModuleList(res_block))
|
||||||
|
if i_level != self.num_resolutions - 1:
|
||||||
|
self.upsample_blocks.append(Upsample(block_in, True))
|
||||||
|
curr_res = curr_res * 2
|
||||||
|
|
||||||
|
# end
|
||||||
|
self.norm_out = Normalize(block_in)
|
||||||
|
self.conv_out = torch.nn.Conv2d(block_in,
|
||||||
|
out_channels,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=1,
|
||||||
|
padding=1)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
# upsampling
|
||||||
|
h = x
|
||||||
|
for k, i_level in enumerate(range(self.num_resolutions)):
|
||||||
|
for i_block in range(self.num_res_blocks + 1):
|
||||||
|
h = self.res_blocks[i_level][i_block](h, None)
|
||||||
|
if i_level != self.num_resolutions - 1:
|
||||||
|
h = self.upsample_blocks[k](h)
|
||||||
|
h = self.norm_out(h)
|
||||||
|
h = nonlinearity(h)
|
||||||
|
h = self.conv_out(h)
|
||||||
|
return h
|
||||||
|
|
||||||
|
|
||||||
|
class LatentRescaler(nn.Module):
|
||||||
|
def __init__(self, factor, in_channels, mid_channels, out_channels, depth=2):
|
||||||
|
super().__init__()
|
||||||
|
# residual block, interpolate, residual block
|
||||||
|
self.factor = factor
|
||||||
|
self.conv_in = nn.Conv2d(in_channels,
|
||||||
|
mid_channels,
|
||||||
|
kernel_size=3,
|
||||||
|
stride=1,
|
||||||
|
padding=1)
|
||||||
|
self.res_block1 = nn.ModuleList([ResnetBlock(in_channels=mid_channels,
|
||||||
|
out_channels=mid_channels,
|
||||||
|
temb_channels=0,
|
||||||
|
dropout=0.0) for _ in range(depth)])
|
||||||
|
self.attn = AttnBlock(mid_channels)
|
||||||
|
self.res_block2 = nn.ModuleList([ResnetBlock(in_channels=mid_channels,
|
||||||
|
out_channels=mid_channels,
|
||||||
|
temb_channels=0,
|
||||||
|
dropout=0.0) for _ in range(depth)])
|
||||||
|
|
||||||
|
self.conv_out = nn.Conv2d(mid_channels,
|
||||||
|
out_channels,
|
||||||
|
kernel_size=1,
|
||||||
|
)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
x = self.conv_in(x)
|
||||||
|
for block in self.res_block1:
|
||||||
|
x = block(x, None)
|
||||||
|
x = torch.nn.functional.interpolate(x, size=(int(round(x.shape[2]*self.factor)), int(round(x.shape[3]*self.factor))))
|
||||||
|
x = self.attn(x)
|
||||||
|
for block in self.res_block2:
|
||||||
|
x = block(x, None)
|
||||||
|
x = self.conv_out(x)
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class MergedRescaleEncoder(nn.Module):
|
||||||
|
def __init__(self, in_channels, ch, resolution, out_ch, num_res_blocks,
|
||||||
|
attn_resolutions, dropout=0.0, resamp_with_conv=True,
|
||||||
|
ch_mult=(1,2,4,8), rescale_factor=1.0, rescale_module_depth=1):
|
||||||
|
super().__init__()
|
||||||
|
intermediate_chn = ch * ch_mult[-1]
|
||||||
|
self.encoder = Encoder(in_channels=in_channels, num_res_blocks=num_res_blocks, ch=ch, ch_mult=ch_mult,
|
||||||
|
z_channels=intermediate_chn, double_z=False, resolution=resolution,
|
||||||
|
attn_resolutions=attn_resolutions, dropout=dropout, resamp_with_conv=resamp_with_conv,
|
||||||
|
out_ch=None)
|
||||||
|
self.rescaler = LatentRescaler(factor=rescale_factor, in_channels=intermediate_chn,
|
||||||
|
mid_channels=intermediate_chn, out_channels=out_ch, depth=rescale_module_depth)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
x = self.encoder(x)
|
||||||
|
x = self.rescaler(x)
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class MergedRescaleDecoder(nn.Module):
|
||||||
|
def __init__(self, z_channels, out_ch, resolution, num_res_blocks, attn_resolutions, ch, ch_mult=(1,2,4,8),
|
||||||
|
dropout=0.0, resamp_with_conv=True, rescale_factor=1.0, rescale_module_depth=1):
|
||||||
|
super().__init__()
|
||||||
|
tmp_chn = z_channels*ch_mult[-1]
|
||||||
|
self.decoder = Decoder(out_ch=out_ch, z_channels=tmp_chn, attn_resolutions=attn_resolutions, dropout=dropout,
|
||||||
|
resamp_with_conv=resamp_with_conv, in_channels=None, num_res_blocks=num_res_blocks,
|
||||||
|
ch_mult=ch_mult, resolution=resolution, ch=ch)
|
||||||
|
self.rescaler = LatentRescaler(factor=rescale_factor, in_channels=z_channels, mid_channels=tmp_chn,
|
||||||
|
out_channels=tmp_chn, depth=rescale_module_depth)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
x = self.rescaler(x)
|
||||||
|
x = self.decoder(x)
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class Upsampler(nn.Module):
|
||||||
|
def __init__(self, in_size, out_size, in_channels, out_channels, ch_mult=2):
|
||||||
|
super().__init__()
|
||||||
|
assert out_size >= in_size
|
||||||
|
num_blocks = int(np.log2(out_size//in_size))+1
|
||||||
|
factor_up = 1.+ (out_size % in_size)
|
||||||
|
print(f"Building {self.__class__.__name__} with in_size: {in_size} --> out_size {out_size} and factor {factor_up}")
|
||||||
|
self.rescaler = LatentRescaler(factor=factor_up, in_channels=in_channels, mid_channels=2*in_channels,
|
||||||
|
out_channels=in_channels)
|
||||||
|
self.decoder = Decoder(out_ch=out_channels, resolution=out_size, z_channels=in_channels, num_res_blocks=2,
|
||||||
|
attn_resolutions=[], in_channels=None, ch=in_channels,
|
||||||
|
ch_mult=[ch_mult for _ in range(num_blocks)])
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
x = self.rescaler(x)
|
||||||
|
x = self.decoder(x)
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class Resize(nn.Module):
|
||||||
|
def __init__(self, in_channels=None, learned=False, mode="bilinear"):
|
||||||
|
super().__init__()
|
||||||
|
self.with_conv = learned
|
||||||
|
self.mode = mode
|
||||||
|
if self.with_conv:
|
||||||
|
print(f"Note: {self.__class__.__name} uses learned downsampling and will ignore the fixed {mode} mode")
|
||||||
|
raise NotImplementedError()
|
||||||
|
assert in_channels is not None
|
||||||
|
# no asymmetric padding in torch conv, must do it ourselves
|
||||||
|
self.conv = torch.nn.Conv2d(in_channels,
|
||||||
|
in_channels,
|
||||||
|
kernel_size=4,
|
||||||
|
stride=2,
|
||||||
|
padding=1)
|
||||||
|
|
||||||
|
def forward(self, x, scale_factor=1.0):
|
||||||
|
if scale_factor==1.0:
|
||||||
|
return x
|
||||||
|
else:
|
||||||
|
x = torch.nn.functional.interpolate(x, mode=self.mode, align_corners=False, scale_factor=scale_factor)
|
||||||
|
return x
|
@ -0,0 +1,786 @@
|
|||||||
|
from abc import abstractmethod
|
||||||
|
import math
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import torch as th
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.nn.functional as F
|
||||||
|
|
||||||
|
from .util import (
|
||||||
|
checkpoint,
|
||||||
|
conv_nd,
|
||||||
|
linear,
|
||||||
|
avg_pool_nd,
|
||||||
|
zero_module,
|
||||||
|
normalization,
|
||||||
|
timestep_embedding,
|
||||||
|
)
|
||||||
|
from ..attention import SpatialTransformer
|
||||||
|
from ...util import exists
|
||||||
|
|
||||||
|
|
||||||
|
# dummy replace
|
||||||
|
def convert_module_to_f16(x):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def convert_module_to_f32(x):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
## go
|
||||||
|
class AttentionPool2d(nn.Module):
|
||||||
|
"""
|
||||||
|
Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
spacial_dim: int,
|
||||||
|
embed_dim: int,
|
||||||
|
num_heads_channels: int,
|
||||||
|
output_dim: int = None,
|
||||||
|
):
|
||||||
|
super().__init__()
|
||||||
|
self.positional_embedding = nn.Parameter(th.randn(embed_dim, spacial_dim ** 2 + 1) / embed_dim ** 0.5)
|
||||||
|
self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1)
|
||||||
|
self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1)
|
||||||
|
self.num_heads = embed_dim // num_heads_channels
|
||||||
|
self.attention = QKVAttention(self.num_heads)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
b, c, *_spatial = x.shape
|
||||||
|
x = x.reshape(b, c, -1) # NC(HW)
|
||||||
|
x = th.cat([x.mean(dim=-1, keepdim=True), x], dim=-1) # NC(HW+1)
|
||||||
|
x = x + self.positional_embedding[None, :, :].to(x.dtype) # NC(HW+1)
|
||||||
|
x = self.qkv_proj(x)
|
||||||
|
x = self.attention(x)
|
||||||
|
x = self.c_proj(x)
|
||||||
|
return x[:, :, 0]
|
||||||
|
|
||||||
|
|
||||||
|
class TimestepBlock(nn.Module):
|
||||||
|
"""
|
||||||
|
Any module where forward() takes timestep embeddings as a second argument.
|
||||||
|
"""
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def forward(self, x, emb):
|
||||||
|
"""
|
||||||
|
Apply the module to `x` given `emb` timestep embeddings.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
|
||||||
|
"""
|
||||||
|
A sequential module that passes timestep embeddings to the children that
|
||||||
|
support it as an extra input.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def forward(self, x, emb, context=None):
|
||||||
|
for layer in self:
|
||||||
|
if isinstance(layer, TimestepBlock):
|
||||||
|
x = layer(x, emb)
|
||||||
|
elif isinstance(layer, SpatialTransformer):
|
||||||
|
x = layer(x, context)
|
||||||
|
else:
|
||||||
|
x = layer(x)
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class Upsample(nn.Module):
|
||||||
|
"""
|
||||||
|
An upsampling layer with an optional convolution.
|
||||||
|
:param channels: channels in the inputs and outputs.
|
||||||
|
:param use_conv: a bool determining if a convolution is applied.
|
||||||
|
:param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
|
||||||
|
upsampling occurs in the inner-two dimensions.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
|
||||||
|
super().__init__()
|
||||||
|
self.channels = channels
|
||||||
|
self.out_channels = out_channels or channels
|
||||||
|
self.use_conv = use_conv
|
||||||
|
self.dims = dims
|
||||||
|
if use_conv:
|
||||||
|
self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=padding)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
assert x.shape[1] == self.channels
|
||||||
|
if self.dims == 3:
|
||||||
|
x = F.interpolate(
|
||||||
|
x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
x = F.interpolate(x, scale_factor=2, mode="nearest")
|
||||||
|
if self.use_conv:
|
||||||
|
x = self.conv(x)
|
||||||
|
return x
|
||||||
|
|
||||||
|
class TransposedUpsample(nn.Module):
|
||||||
|
'Learned 2x upsampling without padding'
|
||||||
|
def __init__(self, channels, out_channels=None, ks=5):
|
||||||
|
super().__init__()
|
||||||
|
self.channels = channels
|
||||||
|
self.out_channels = out_channels or channels
|
||||||
|
|
||||||
|
self.up = nn.ConvTranspose2d(self.channels,self.out_channels,kernel_size=ks,stride=2)
|
||||||
|
|
||||||
|
def forward(self,x):
|
||||||
|
return self.up(x)
|
||||||
|
|
||||||
|
|
||||||
|
class Downsample(nn.Module):
|
||||||
|
"""
|
||||||
|
A downsampling layer with an optional convolution.
|
||||||
|
:param channels: channels in the inputs and outputs.
|
||||||
|
:param use_conv: a bool determining if a convolution is applied.
|
||||||
|
:param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
|
||||||
|
downsampling occurs in the inner-two dimensions.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, channels, use_conv, dims=2, out_channels=None,padding=1):
|
||||||
|
super().__init__()
|
||||||
|
self.channels = channels
|
||||||
|
self.out_channels = out_channels or channels
|
||||||
|
self.use_conv = use_conv
|
||||||
|
self.dims = dims
|
||||||
|
stride = 2 if dims != 3 else (1, 2, 2)
|
||||||
|
if use_conv:
|
||||||
|
self.op = conv_nd(
|
||||||
|
dims, self.channels, self.out_channels, 3, stride=stride, padding=padding
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
assert self.channels == self.out_channels
|
||||||
|
self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
assert x.shape[1] == self.channels
|
||||||
|
return self.op(x)
|
||||||
|
|
||||||
|
|
||||||
|
class ResBlock(TimestepBlock):
|
||||||
|
"""
|
||||||
|
A residual block that can optionally change the number of channels.
|
||||||
|
:param channels: the number of input channels.
|
||||||
|
:param emb_channels: the number of timestep embedding channels.
|
||||||
|
:param dropout: the rate of dropout.
|
||||||
|
:param out_channels: if specified, the number of out channels.
|
||||||
|
:param use_conv: if True and out_channels is specified, use a spatial
|
||||||
|
convolution instead of a smaller 1x1 convolution to change the
|
||||||
|
channels in the skip connection.
|
||||||
|
:param dims: determines if the signal is 1D, 2D, or 3D.
|
||||||
|
:param use_checkpoint: if True, use gradient checkpointing on this module.
|
||||||
|
:param up: if True, use this block for upsampling.
|
||||||
|
:param down: if True, use this block for downsampling.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
channels,
|
||||||
|
emb_channels,
|
||||||
|
dropout,
|
||||||
|
out_channels=None,
|
||||||
|
use_conv=False,
|
||||||
|
use_scale_shift_norm=False,
|
||||||
|
dims=2,
|
||||||
|
use_checkpoint=False,
|
||||||
|
up=False,
|
||||||
|
down=False,
|
||||||
|
):
|
||||||
|
super().__init__()
|
||||||
|
self.channels = channels
|
||||||
|
self.emb_channels = emb_channels
|
||||||
|
self.dropout = dropout
|
||||||
|
self.out_channels = out_channels or channels
|
||||||
|
self.use_conv = use_conv
|
||||||
|
self.use_checkpoint = use_checkpoint
|
||||||
|
self.use_scale_shift_norm = use_scale_shift_norm
|
||||||
|
|
||||||
|
self.in_layers = nn.Sequential(
|
||||||
|
normalization(channels),
|
||||||
|
nn.SiLU(),
|
||||||
|
conv_nd(dims, channels, self.out_channels, 3, padding=1),
|
||||||
|
)
|
||||||
|
|
||||||
|
self.updown = up or down
|
||||||
|
|
||||||
|
if up:
|
||||||
|
self.h_upd = Upsample(channels, False, dims)
|
||||||
|
self.x_upd = Upsample(channels, False, dims)
|
||||||
|
elif down:
|
||||||
|
self.h_upd = Downsample(channels, False, dims)
|
||||||
|
self.x_upd = Downsample(channels, False, dims)
|
||||||
|
else:
|
||||||
|
self.h_upd = self.x_upd = nn.Identity()
|
||||||
|
|
||||||
|
self.emb_layers = nn.Sequential(
|
||||||
|
nn.SiLU(),
|
||||||
|
linear(
|
||||||
|
emb_channels,
|
||||||
|
2 * self.out_channels if use_scale_shift_norm else self.out_channels,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
self.out_layers = nn.Sequential(
|
||||||
|
normalization(self.out_channels),
|
||||||
|
nn.SiLU(),
|
||||||
|
nn.Dropout(p=dropout),
|
||||||
|
zero_module(
|
||||||
|
conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
if self.out_channels == channels:
|
||||||
|
self.skip_connection = nn.Identity()
|
||||||
|
elif use_conv:
|
||||||
|
self.skip_connection = conv_nd(
|
||||||
|
dims, channels, self.out_channels, 3, padding=1
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)
|
||||||
|
|
||||||
|
def forward(self, x, emb):
|
||||||
|
"""
|
||||||
|
Apply the block to a Tensor, conditioned on a timestep embedding.
|
||||||
|
:param x: an [N x C x ...] Tensor of features.
|
||||||
|
:param emb: an [N x emb_channels] Tensor of timestep embeddings.
|
||||||
|
:return: an [N x C x ...] Tensor of outputs.
|
||||||
|
"""
|
||||||
|
return checkpoint(
|
||||||
|
self._forward, (x, emb), self.parameters(), self.use_checkpoint
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _forward(self, x, emb):
|
||||||
|
if self.updown:
|
||||||
|
in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
|
||||||
|
h = in_rest(x)
|
||||||
|
h = self.h_upd(h)
|
||||||
|
x = self.x_upd(x)
|
||||||
|
h = in_conv(h)
|
||||||
|
else:
|
||||||
|
h = self.in_layers(x)
|
||||||
|
emb_out = self.emb_layers(emb).type(h.dtype)
|
||||||
|
while len(emb_out.shape) < len(h.shape):
|
||||||
|
emb_out = emb_out[..., None]
|
||||||
|
if self.use_scale_shift_norm:
|
||||||
|
out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
|
||||||
|
scale, shift = th.chunk(emb_out, 2, dim=1)
|
||||||
|
h = out_norm(h) * (1 + scale) + shift
|
||||||
|
h = out_rest(h)
|
||||||
|
else:
|
||||||
|
h = h + emb_out
|
||||||
|
h = self.out_layers(h)
|
||||||
|
return self.skip_connection(x) + h
|
||||||
|
|
||||||
|
|
||||||
|
class AttentionBlock(nn.Module):
|
||||||
|
"""
|
||||||
|
An attention block that allows spatial positions to attend to each other.
|
||||||
|
Originally ported from here, but adapted to the N-d case.
|
||||||
|
https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
channels,
|
||||||
|
num_heads=1,
|
||||||
|
num_head_channels=-1,
|
||||||
|
use_checkpoint=False,
|
||||||
|
use_new_attention_order=False,
|
||||||
|
):
|
||||||
|
super().__init__()
|
||||||
|
self.channels = channels
|
||||||
|
if num_head_channels == -1:
|
||||||
|
self.num_heads = num_heads
|
||||||
|
else:
|
||||||
|
assert (
|
||||||
|
channels % num_head_channels == 0
|
||||||
|
), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
|
||||||
|
self.num_heads = channels // num_head_channels
|
||||||
|
self.use_checkpoint = use_checkpoint
|
||||||
|
self.norm = normalization(channels)
|
||||||
|
self.qkv = conv_nd(1, channels, channels * 3, 1)
|
||||||
|
if use_new_attention_order:
|
||||||
|
# split qkv before split heads
|
||||||
|
self.attention = QKVAttention(self.num_heads)
|
||||||
|
else:
|
||||||
|
# split heads before split qkv
|
||||||
|
self.attention = QKVAttentionLegacy(self.num_heads)
|
||||||
|
|
||||||
|
self.proj_out = zero_module(conv_nd(1, channels, channels, 1))
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
return checkpoint(self._forward, (x,), self.parameters(), True) # TODO: check checkpoint usage, is True # TODO: fix the .half call!!!
|
||||||
|
#return pt_checkpoint(self._forward, x) # pytorch
|
||||||
|
|
||||||
|
def _forward(self, x):
|
||||||
|
b, c, *spatial = x.shape
|
||||||
|
x = x.reshape(b, c, -1)
|
||||||
|
qkv = self.qkv(self.norm(x))
|
||||||
|
h = self.attention(qkv)
|
||||||
|
h = self.proj_out(h)
|
||||||
|
return (x + h).reshape(b, c, *spatial)
|
||||||
|
|
||||||
|
|
||||||
|
def count_flops_attn(model, _x, y):
|
||||||
|
"""
|
||||||
|
A counter for the `thop` package to count the operations in an
|
||||||
|
attention operation.
|
||||||
|
Meant to be used like:
|
||||||
|
macs, params = thop.profile(
|
||||||
|
model,
|
||||||
|
inputs=(inputs, timestamps),
|
||||||
|
custom_ops={QKVAttention: QKVAttention.count_flops},
|
||||||
|
)
|
||||||
|
"""
|
||||||
|
b, c, *spatial = y[0].shape
|
||||||
|
num_spatial = int(np.prod(spatial))
|
||||||
|
# We perform two matmuls with the same number of ops.
|
||||||
|
# The first computes the weight matrix, the second computes
|
||||||
|
# the combination of the value vectors.
|
||||||
|
matmul_ops = 2 * b * (num_spatial ** 2) * c
|
||||||
|
model.total_ops += th.DoubleTensor([matmul_ops])
|
||||||
|
|
||||||
|
|
||||||
|
class QKVAttentionLegacy(nn.Module):
|
||||||
|
"""
|
||||||
|
A module which performs QKV attention. Matches legacy QKVAttention + input/ouput heads shaping
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, n_heads):
|
||||||
|
super().__init__()
|
||||||
|
self.n_heads = n_heads
|
||||||
|
|
||||||
|
def forward(self, qkv):
|
||||||
|
"""
|
||||||
|
Apply QKV attention.
|
||||||
|
:param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
|
||||||
|
:return: an [N x (H * C) x T] tensor after attention.
|
||||||
|
"""
|
||||||
|
bs, width, length = qkv.shape
|
||||||
|
assert width % (3 * self.n_heads) == 0
|
||||||
|
ch = width // (3 * self.n_heads)
|
||||||
|
q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
|
||||||
|
scale = 1 / math.sqrt(math.sqrt(ch))
|
||||||
|
weight = th.einsum(
|
||||||
|
"bct,bcs->bts", q * scale, k * scale
|
||||||
|
) # More stable with f16 than dividing afterwards
|
||||||
|
weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
|
||||||
|
a = th.einsum("bts,bcs->bct", weight, v)
|
||||||
|
return a.reshape(bs, -1, length)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def count_flops(model, _x, y):
|
||||||
|
return count_flops_attn(model, _x, y)
|
||||||
|
|
||||||
|
|
||||||
|
class QKVAttention(nn.Module):
|
||||||
|
"""
|
||||||
|
A module which performs QKV attention and splits in a different order.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, n_heads):
|
||||||
|
super().__init__()
|
||||||
|
self.n_heads = n_heads
|
||||||
|
|
||||||
|
def forward(self, qkv):
|
||||||
|
"""
|
||||||
|
Apply QKV attention.
|
||||||
|
:param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
|
||||||
|
:return: an [N x (H * C) x T] tensor after attention.
|
||||||
|
"""
|
||||||
|
bs, width, length = qkv.shape
|
||||||
|
assert width % (3 * self.n_heads) == 0
|
||||||
|
ch = width // (3 * self.n_heads)
|
||||||
|
q, k, v = qkv.chunk(3, dim=1)
|
||||||
|
scale = 1 / math.sqrt(math.sqrt(ch))
|
||||||
|
weight = th.einsum(
|
||||||
|
"bct,bcs->bts",
|
||||||
|
(q * scale).view(bs * self.n_heads, ch, length),
|
||||||
|
(k * scale).view(bs * self.n_heads, ch, length),
|
||||||
|
) # More stable with f16 than dividing afterwards
|
||||||
|
weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
|
||||||
|
a = th.einsum("bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length))
|
||||||
|
return a.reshape(bs, -1, length)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def count_flops(model, _x, y):
|
||||||
|
return count_flops_attn(model, _x, y)
|
||||||
|
|
||||||
|
|
||||||
|
class UNetModel(nn.Module):
|
||||||
|
"""
|
||||||
|
The full UNet model with attention and timestep embedding.
|
||||||
|
:param in_channels: channels in the input Tensor.
|
||||||
|
:param model_channels: base channel count for the model.
|
||||||
|
:param out_channels: channels in the output Tensor.
|
||||||
|
:param num_res_blocks: number of residual blocks per downsample.
|
||||||
|
:param attention_resolutions: a collection of downsample rates at which
|
||||||
|
attention will take place. May be a set, list, or tuple.
|
||||||
|
For example, if this contains 4, then at 4x downsampling, attention
|
||||||
|
will be used.
|
||||||
|
:param dropout: the dropout probability.
|
||||||
|
:param channel_mult: channel multiplier for each level of the UNet.
|
||||||
|
:param conv_resample: if True, use learned convolutions for upsampling and
|
||||||
|
downsampling.
|
||||||
|
:param dims: determines if the signal is 1D, 2D, or 3D.
|
||||||
|
:param num_classes: if specified (as an int), then this model will be
|
||||||
|
class-conditional with `num_classes` classes.
|
||||||
|
:param use_checkpoint: use gradient checkpointing to reduce memory usage.
|
||||||
|
:param num_heads: the number of attention heads in each attention layer.
|
||||||
|
:param num_heads_channels: if specified, ignore num_heads and instead use
|
||||||
|
a fixed channel width per attention head.
|
||||||
|
:param num_heads_upsample: works with num_heads to set a different number
|
||||||
|
of heads for upsampling. Deprecated.
|
||||||
|
:param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
|
||||||
|
:param resblock_updown: use residual blocks for up/downsampling.
|
||||||
|
:param use_new_attention_order: use a different attention pattern for potentially
|
||||||
|
increased efficiency.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
image_size,
|
||||||
|
in_channels,
|
||||||
|
model_channels,
|
||||||
|
out_channels,
|
||||||
|
num_res_blocks,
|
||||||
|
attention_resolutions,
|
||||||
|
dropout=0,
|
||||||
|
channel_mult=(1, 2, 4, 8),
|
||||||
|
conv_resample=True,
|
||||||
|
dims=2,
|
||||||
|
num_classes=None,
|
||||||
|
use_checkpoint=False,
|
||||||
|
use_fp16=False,
|
||||||
|
num_heads=-1,
|
||||||
|
num_head_channels=-1,
|
||||||
|
num_heads_upsample=-1,
|
||||||
|
use_scale_shift_norm=False,
|
||||||
|
resblock_updown=False,
|
||||||
|
use_new_attention_order=False,
|
||||||
|
use_spatial_transformer=False, # custom transformer support
|
||||||
|
transformer_depth=1, # custom transformer support
|
||||||
|
context_dim=None, # custom transformer support
|
||||||
|
n_embed=None, # custom support for prediction of discrete ids into codebook of first stage vq model
|
||||||
|
legacy=True,
|
||||||
|
disable_self_attentions=None,
|
||||||
|
num_attention_blocks=None,
|
||||||
|
disable_middle_self_attn=False,
|
||||||
|
use_linear_in_transformer=False,
|
||||||
|
):
|
||||||
|
super().__init__()
|
||||||
|
if use_spatial_transformer:
|
||||||
|
assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'
|
||||||
|
|
||||||
|
if context_dim is not None:
|
||||||
|
assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
|
||||||
|
from omegaconf.listconfig import ListConfig
|
||||||
|
if type(context_dim) == ListConfig:
|
||||||
|
context_dim = list(context_dim)
|
||||||
|
|
||||||
|
if num_heads_upsample == -1:
|
||||||
|
num_heads_upsample = num_heads
|
||||||
|
|
||||||
|
if num_heads == -1:
|
||||||
|
assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'
|
||||||
|
|
||||||
|
if num_head_channels == -1:
|
||||||
|
assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'
|
||||||
|
|
||||||
|
self.image_size = image_size
|
||||||
|
self.in_channels = in_channels
|
||||||
|
self.model_channels = model_channels
|
||||||
|
self.out_channels = out_channels
|
||||||
|
if isinstance(num_res_blocks, int):
|
||||||
|
self.num_res_blocks = len(channel_mult) * [num_res_blocks]
|
||||||
|
else:
|
||||||
|
if len(num_res_blocks) != len(channel_mult):
|
||||||
|
raise ValueError("provide num_res_blocks either as an int (globally constant) or "
|
||||||
|
"as a list/tuple (per-level) with the same length as channel_mult")
|
||||||
|
self.num_res_blocks = num_res_blocks
|
||||||
|
if disable_self_attentions is not None:
|
||||||
|
# should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
|
||||||
|
assert len(disable_self_attentions) == len(channel_mult)
|
||||||
|
if num_attention_blocks is not None:
|
||||||
|
assert len(num_attention_blocks) == len(self.num_res_blocks)
|
||||||
|
assert all(map(lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], range(len(num_attention_blocks))))
|
||||||
|
print(f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. "
|
||||||
|
f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
|
||||||
|
f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
|
||||||
|
f"attention will still not be set.")
|
||||||
|
self.use_fp16 = use_fp16
|
||||||
|
self.attention_resolutions = attention_resolutions
|
||||||
|
self.dropout = dropout
|
||||||
|
self.channel_mult = channel_mult
|
||||||
|
self.conv_resample = conv_resample
|
||||||
|
self.num_classes = num_classes
|
||||||
|
self.use_checkpoint = use_checkpoint
|
||||||
|
self.dtype = th.float16 if use_fp16 else th.float32
|
||||||
|
self.num_heads = num_heads
|
||||||
|
self.num_head_channels = num_head_channels
|
||||||
|
self.num_heads_upsample = num_heads_upsample
|
||||||
|
self.predict_codebook_ids = n_embed is not None
|
||||||
|
|
||||||
|
time_embed_dim = model_channels * 4
|
||||||
|
self.time_embed = nn.Sequential(
|
||||||
|
linear(model_channels, time_embed_dim),
|
||||||
|
nn.SiLU(),
|
||||||
|
linear(time_embed_dim, time_embed_dim),
|
||||||
|
)
|
||||||
|
|
||||||
|
if self.num_classes is not None:
|
||||||
|
if isinstance(self.num_classes, int):
|
||||||
|
self.label_emb = nn.Embedding(num_classes, time_embed_dim)
|
||||||
|
elif self.num_classes == "continuous":
|
||||||
|
print("setting up linear c_adm embedding layer")
|
||||||
|
self.label_emb = nn.Linear(1, time_embed_dim)
|
||||||
|
else:
|
||||||
|
raise ValueError()
|
||||||
|
|
||||||
|
self.input_blocks = nn.ModuleList(
|
||||||
|
[
|
||||||
|
TimestepEmbedSequential(
|
||||||
|
conv_nd(dims, in_channels, model_channels, 3, padding=1)
|
||||||
|
)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
self._feature_size = model_channels
|
||||||
|
input_block_chans = [model_channels]
|
||||||
|
ch = model_channels
|
||||||
|
ds = 1
|
||||||
|
for level, mult in enumerate(channel_mult):
|
||||||
|
for nr in range(self.num_res_blocks[level]):
|
||||||
|
layers = [
|
||||||
|
ResBlock(
|
||||||
|
ch,
|
||||||
|
time_embed_dim,
|
||||||
|
dropout,
|
||||||
|
out_channels=mult * model_channels,
|
||||||
|
dims=dims,
|
||||||
|
use_checkpoint=use_checkpoint,
|
||||||
|
use_scale_shift_norm=use_scale_shift_norm,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
ch = mult * model_channels
|
||||||
|
if ds in attention_resolutions:
|
||||||
|
if num_head_channels == -1:
|
||||||
|
dim_head = ch // num_heads
|
||||||
|
else:
|
||||||
|
num_heads = ch // num_head_channels
|
||||||
|
dim_head = num_head_channels
|
||||||
|
if legacy:
|
||||||
|
#num_heads = 1
|
||||||
|
dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
|
||||||
|
if exists(disable_self_attentions):
|
||||||
|
disabled_sa = disable_self_attentions[level]
|
||||||
|
else:
|
||||||
|
disabled_sa = False
|
||||||
|
|
||||||
|
if not exists(num_attention_blocks) or nr < num_attention_blocks[level]:
|
||||||
|
layers.append(
|
||||||
|
AttentionBlock(
|
||||||
|
ch,
|
||||||
|
use_checkpoint=use_checkpoint,
|
||||||
|
num_heads=num_heads,
|
||||||
|
num_head_channels=dim_head,
|
||||||
|
use_new_attention_order=use_new_attention_order,
|
||||||
|
) if not use_spatial_transformer else SpatialTransformer(
|
||||||
|
ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
|
||||||
|
disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer,
|
||||||
|
use_checkpoint=use_checkpoint
|
||||||
|
)
|
||||||
|
)
|
||||||
|
self.input_blocks.append(TimestepEmbedSequential(*layers))
|
||||||
|
self._feature_size += ch
|
||||||
|
input_block_chans.append(ch)
|
||||||
|
if level != len(channel_mult) - 1:
|
||||||
|
out_ch = ch
|
||||||
|
self.input_blocks.append(
|
||||||
|
TimestepEmbedSequential(
|
||||||
|
ResBlock(
|
||||||
|
ch,
|
||||||
|
time_embed_dim,
|
||||||
|
dropout,
|
||||||
|
out_channels=out_ch,
|
||||||
|
dims=dims,
|
||||||
|
use_checkpoint=use_checkpoint,
|
||||||
|
use_scale_shift_norm=use_scale_shift_norm,
|
||||||
|
down=True,
|
||||||
|
)
|
||||||
|
if resblock_updown
|
||||||
|
else Downsample(
|
||||||
|
ch, conv_resample, dims=dims, out_channels=out_ch
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
ch = out_ch
|
||||||
|
input_block_chans.append(ch)
|
||||||
|
ds *= 2
|
||||||
|
self._feature_size += ch
|
||||||
|
|
||||||
|
if num_head_channels == -1:
|
||||||
|
dim_head = ch // num_heads
|
||||||
|
else:
|
||||||
|
num_heads = ch // num_head_channels
|
||||||
|
dim_head = num_head_channels
|
||||||
|
if legacy:
|
||||||
|
#num_heads = 1
|
||||||
|
dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
|
||||||
|
self.middle_block = TimestepEmbedSequential(
|
||||||
|
ResBlock(
|
||||||
|
ch,
|
||||||
|
time_embed_dim,
|
||||||
|
dropout,
|
||||||
|
dims=dims,
|
||||||
|
use_checkpoint=use_checkpoint,
|
||||||
|
use_scale_shift_norm=use_scale_shift_norm,
|
||||||
|
),
|
||||||
|
AttentionBlock(
|
||||||
|
ch,
|
||||||
|
use_checkpoint=use_checkpoint,
|
||||||
|
num_heads=num_heads,
|
||||||
|
num_head_channels=dim_head,
|
||||||
|
use_new_attention_order=use_new_attention_order,
|
||||||
|
) if not use_spatial_transformer else SpatialTransformer( # always uses a self-attn
|
||||||
|
ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
|
||||||
|
disable_self_attn=disable_middle_self_attn, use_linear=use_linear_in_transformer,
|
||||||
|
use_checkpoint=use_checkpoint
|
||||||
|
),
|
||||||
|
ResBlock(
|
||||||
|
ch,
|
||||||
|
time_embed_dim,
|
||||||
|
dropout,
|
||||||
|
dims=dims,
|
||||||
|
use_checkpoint=use_checkpoint,
|
||||||
|
use_scale_shift_norm=use_scale_shift_norm,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
self._feature_size += ch
|
||||||
|
|
||||||
|
self.output_blocks = nn.ModuleList([])
|
||||||
|
for level, mult in list(enumerate(channel_mult))[::-1]:
|
||||||
|
for i in range(self.num_res_blocks[level] + 1):
|
||||||
|
ich = input_block_chans.pop()
|
||||||
|
layers = [
|
||||||
|
ResBlock(
|
||||||
|
ch + ich,
|
||||||
|
time_embed_dim,
|
||||||
|
dropout,
|
||||||
|
out_channels=model_channels * mult,
|
||||||
|
dims=dims,
|
||||||
|
use_checkpoint=use_checkpoint,
|
||||||
|
use_scale_shift_norm=use_scale_shift_norm,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
ch = model_channels * mult
|
||||||
|
if ds in attention_resolutions:
|
||||||
|
if num_head_channels == -1:
|
||||||
|
dim_head = ch // num_heads
|
||||||
|
else:
|
||||||
|
num_heads = ch // num_head_channels
|
||||||
|
dim_head = num_head_channels
|
||||||
|
if legacy:
|
||||||
|
#num_heads = 1
|
||||||
|
dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
|
||||||
|
if exists(disable_self_attentions):
|
||||||
|
disabled_sa = disable_self_attentions[level]
|
||||||
|
else:
|
||||||
|
disabled_sa = False
|
||||||
|
|
||||||
|
if not exists(num_attention_blocks) or i < num_attention_blocks[level]:
|
||||||
|
layers.append(
|
||||||
|
AttentionBlock(
|
||||||
|
ch,
|
||||||
|
use_checkpoint=use_checkpoint,
|
||||||
|
num_heads=num_heads_upsample,
|
||||||
|
num_head_channels=dim_head,
|
||||||
|
use_new_attention_order=use_new_attention_order,
|
||||||
|
) if not use_spatial_transformer else SpatialTransformer(
|
||||||
|
ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
|
||||||
|
disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer,
|
||||||
|
use_checkpoint=use_checkpoint
|
||||||
|
)
|
||||||
|
)
|
||||||
|
if level and i == self.num_res_blocks[level]:
|
||||||
|
out_ch = ch
|
||||||
|
layers.append(
|
||||||
|
ResBlock(
|
||||||
|
ch,
|
||||||
|
time_embed_dim,
|
||||||
|
dropout,
|
||||||
|
out_channels=out_ch,
|
||||||
|
dims=dims,
|
||||||
|
use_checkpoint=use_checkpoint,
|
||||||
|
use_scale_shift_norm=use_scale_shift_norm,
|
||||||
|
up=True,
|
||||||
|
)
|
||||||
|
if resblock_updown
|
||||||
|
else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
|
||||||
|
)
|
||||||
|
ds //= 2
|
||||||
|
self.output_blocks.append(TimestepEmbedSequential(*layers))
|
||||||
|
self._feature_size += ch
|
||||||
|
|
||||||
|
self.out = nn.Sequential(
|
||||||
|
normalization(ch),
|
||||||
|
nn.SiLU(),
|
||||||
|
zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1)),
|
||||||
|
)
|
||||||
|
if self.predict_codebook_ids:
|
||||||
|
self.id_predictor = nn.Sequential(
|
||||||
|
normalization(ch),
|
||||||
|
conv_nd(dims, model_channels, n_embed, 1),
|
||||||
|
#nn.LogSoftmax(dim=1) # change to cross_entropy and produce non-normalized logits
|
||||||
|
)
|
||||||
|
|
||||||
|
def convert_to_fp16(self):
|
||||||
|
"""
|
||||||
|
Convert the torso of the model to float16.
|
||||||
|
"""
|
||||||
|
self.input_blocks.apply(convert_module_to_f16)
|
||||||
|
self.middle_block.apply(convert_module_to_f16)
|
||||||
|
self.output_blocks.apply(convert_module_to_f16)
|
||||||
|
|
||||||
|
def convert_to_fp32(self):
|
||||||
|
"""
|
||||||
|
Convert the torso of the model to float32.
|
||||||
|
"""
|
||||||
|
self.input_blocks.apply(convert_module_to_f32)
|
||||||
|
self.middle_block.apply(convert_module_to_f32)
|
||||||
|
self.output_blocks.apply(convert_module_to_f32)
|
||||||
|
|
||||||
|
def forward(self, x, timesteps=None, context=None, y=None,**kwargs):
|
||||||
|
"""
|
||||||
|
Apply the model to an input batch.
|
||||||
|
:param x: an [N x C x ...] Tensor of inputs.
|
||||||
|
:param timesteps: a 1-D batch of timesteps.
|
||||||
|
:param context: conditioning plugged in via crossattn
|
||||||
|
:param y: an [N] Tensor of labels, if class-conditional.
|
||||||
|
:return: an [N x C x ...] Tensor of outputs.
|
||||||
|
"""
|
||||||
|
assert (y is not None) == (
|
||||||
|
self.num_classes is not None
|
||||||
|
), "must specify y if and only if the model is class-conditional"
|
||||||
|
hs = []
|
||||||
|
t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
|
||||||
|
emb = self.time_embed(t_emb)
|
||||||
|
|
||||||
|
if self.num_classes is not None:
|
||||||
|
assert y.shape[0] == x.shape[0]
|
||||||
|
emb = emb + self.label_emb(y)
|
||||||
|
|
||||||
|
h = x.type(self.dtype)
|
||||||
|
for module in self.input_blocks:
|
||||||
|
h = module(h, emb, context)
|
||||||
|
hs.append(h)
|
||||||
|
h = self.middle_block(h, emb, context)
|
||||||
|
for module in self.output_blocks:
|
||||||
|
h = th.cat([h, hs.pop()], dim=1)
|
||||||
|
h = module(h, emb, context)
|
||||||
|
h = h.type(x.dtype)
|
||||||
|
if self.predict_codebook_ids:
|
||||||
|
return self.id_predictor(h)
|
||||||
|
else:
|
||||||
|
return self.out(h)
|
@ -0,0 +1,81 @@
|
|||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import numpy as np
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
|
from .util import extract_into_tensor, make_beta_schedule
|
||||||
|
from ...util import default
|
||||||
|
|
||||||
|
|
||||||
|
class AbstractLowScaleModel(nn.Module):
|
||||||
|
# for concatenating a downsampled image to the latent representation
|
||||||
|
def __init__(self, noise_schedule_config=None):
|
||||||
|
super(AbstractLowScaleModel, self).__init__()
|
||||||
|
if noise_schedule_config is not None:
|
||||||
|
self.register_schedule(**noise_schedule_config)
|
||||||
|
|
||||||
|
def register_schedule(self, beta_schedule="linear", timesteps=1000,
|
||||||
|
linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
|
||||||
|
betas = make_beta_schedule(beta_schedule, timesteps, linear_start=linear_start, linear_end=linear_end,
|
||||||
|
cosine_s=cosine_s)
|
||||||
|
alphas = 1. - betas
|
||||||
|
alphas_cumprod = np.cumprod(alphas, axis=0)
|
||||||
|
alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])
|
||||||
|
|
||||||
|
timesteps, = betas.shape
|
||||||
|
self.num_timesteps = int(timesteps)
|
||||||
|
self.linear_start = linear_start
|
||||||
|
self.linear_end = linear_end
|
||||||
|
assert alphas_cumprod.shape[0] == self.num_timesteps, 'alphas have to be defined for each timestep'
|
||||||
|
|
||||||
|
to_torch = partial(torch.tensor, dtype=torch.float32)
|
||||||
|
|
||||||
|
self.register_buffer('betas', to_torch(betas))
|
||||||
|
self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
|
||||||
|
self.register_buffer('alphas_cumprod_prev', to_torch(alphas_cumprod_prev))
|
||||||
|
|
||||||
|
# calculations for diffusion q(x_t | x_{t-1}) and others
|
||||||
|
self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod)))
|
||||||
|
self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod)))
|
||||||
|
self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod)))
|
||||||
|
self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod)))
|
||||||
|
self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod - 1)))
|
||||||
|
|
||||||
|
def q_sample(self, x_start, t, noise=None):
|
||||||
|
noise = default(noise, lambda: torch.randn_like(x_start))
|
||||||
|
return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start +
|
||||||
|
extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
return x, None
|
||||||
|
|
||||||
|
def decode(self, x):
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class SimpleImageConcat(AbstractLowScaleModel):
|
||||||
|
# no noise level conditioning
|
||||||
|
def __init__(self):
|
||||||
|
super(SimpleImageConcat, self).__init__(noise_schedule_config=None)
|
||||||
|
self.max_noise_level = 0
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
# fix to constant noise level
|
||||||
|
return x, torch.zeros(x.shape[0], device=x.device).long()
|
||||||
|
|
||||||
|
|
||||||
|
class ImageConcatWithNoiseAugmentation(AbstractLowScaleModel):
|
||||||
|
def __init__(self, noise_schedule_config, max_noise_level=1000, to_cuda=False):
|
||||||
|
super().__init__(noise_schedule_config=noise_schedule_config)
|
||||||
|
self.max_noise_level = max_noise_level
|
||||||
|
|
||||||
|
def forward(self, x, noise_level=None):
|
||||||
|
if noise_level is None:
|
||||||
|
noise_level = torch.randint(0, self.max_noise_level, (x.shape[0],), device=x.device).long()
|
||||||
|
else:
|
||||||
|
assert isinstance(noise_level, torch.Tensor)
|
||||||
|
z = self.q_sample(x, noise_level)
|
||||||
|
return z, noise_level
|
||||||
|
|
||||||
|
|
||||||
|
|
271
AnyText/AnyText_scripts/ldm/modules/diffusionmodules/util.py
Normal file
@ -0,0 +1,271 @@
|
|||||||
|
# adopted from
|
||||||
|
# https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
|
||||||
|
# and
|
||||||
|
# https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
|
||||||
|
# and
|
||||||
|
# https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py
|
||||||
|
#
|
||||||
|
# thanks!
|
||||||
|
|
||||||
|
|
||||||
|
import os
|
||||||
|
import math
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import numpy as np
|
||||||
|
from einops import repeat
|
||||||
|
|
||||||
|
from ...util import instantiate_from_config
|
||||||
|
|
||||||
|
|
||||||
|
def make_beta_schedule(schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
|
||||||
|
if schedule == "linear":
|
||||||
|
betas = (
|
||||||
|
torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2
|
||||||
|
)
|
||||||
|
|
||||||
|
elif schedule == "cosine":
|
||||||
|
timesteps = (
|
||||||
|
torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s
|
||||||
|
)
|
||||||
|
alphas = timesteps / (1 + cosine_s) * np.pi / 2
|
||||||
|
alphas = torch.cos(alphas).pow(2)
|
||||||
|
alphas = alphas / alphas[0]
|
||||||
|
betas = 1 - alphas[1:] / alphas[:-1]
|
||||||
|
betas = np.clip(betas, a_min=0, a_max=0.999)
|
||||||
|
|
||||||
|
elif schedule == "sqrt_linear":
|
||||||
|
betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64)
|
||||||
|
elif schedule == "sqrt":
|
||||||
|
betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64) ** 0.5
|
||||||
|
else:
|
||||||
|
raise ValueError(f"schedule '{schedule}' unknown.")
|
||||||
|
return betas.numpy()
|
||||||
|
|
||||||
|
|
||||||
|
def make_ddim_timesteps(ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True):
|
||||||
|
if ddim_discr_method == 'uniform':
|
||||||
|
c = num_ddpm_timesteps // num_ddim_timesteps
|
||||||
|
ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c)))
|
||||||
|
elif ddim_discr_method == 'quad':
|
||||||
|
ddim_timesteps = ((np.linspace(0, np.sqrt(num_ddpm_timesteps * .8), num_ddim_timesteps)) ** 2).astype(int)
|
||||||
|
else:
|
||||||
|
raise NotImplementedError(f'There is no ddim discretization method called "{ddim_discr_method}"')
|
||||||
|
|
||||||
|
# assert ddim_timesteps.shape[0] == num_ddim_timesteps
|
||||||
|
# add one to get the final alpha values right (the ones from first scale to data during sampling)
|
||||||
|
steps_out = ddim_timesteps + 1
|
||||||
|
if verbose:
|
||||||
|
print(f'Selected timesteps for ddim sampler: {steps_out}')
|
||||||
|
return steps_out
|
||||||
|
|
||||||
|
|
||||||
|
def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True):
|
||||||
|
# select alphas for computing the variance schedule
|
||||||
|
alphas = alphacums[ddim_timesteps]
|
||||||
|
alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist())
|
||||||
|
|
||||||
|
# according the the formula provided in https://arxiv.org/abs/2010.02502
|
||||||
|
sigmas = eta * np.sqrt((1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev))
|
||||||
|
if verbose:
|
||||||
|
print(f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}')
|
||||||
|
print(f'For the chosen value of eta, which is {eta}, '
|
||||||
|
f'this results in the following sigma_t schedule for ddim sampler {sigmas}')
|
||||||
|
return sigmas, alphas, alphas_prev
|
||||||
|
|
||||||
|
|
||||||
|
def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
|
||||||
|
"""
|
||||||
|
Create a beta schedule that discretizes the given alpha_t_bar function,
|
||||||
|
which defines the cumulative product of (1-beta) over time from t = [0,1].
|
||||||
|
:param num_diffusion_timesteps: the number of betas to produce.
|
||||||
|
:param alpha_bar: a lambda that takes an argument t from 0 to 1 and
|
||||||
|
produces the cumulative product of (1-beta) up to that
|
||||||
|
part of the diffusion process.
|
||||||
|
:param max_beta: the maximum beta to use; use values lower than 1 to
|
||||||
|
prevent singularities.
|
||||||
|
"""
|
||||||
|
betas = []
|
||||||
|
for i in range(num_diffusion_timesteps):
|
||||||
|
t1 = i / num_diffusion_timesteps
|
||||||
|
t2 = (i + 1) / num_diffusion_timesteps
|
||||||
|
betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
|
||||||
|
return np.array(betas)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_into_tensor(a, t, x_shape):
|
||||||
|
b, *_ = t.shape
|
||||||
|
out = a.gather(-1, t)
|
||||||
|
return out.reshape(b, *((1,) * (len(x_shape) - 1)))
|
||||||
|
|
||||||
|
|
||||||
|
def checkpoint(func, inputs, params, flag):
|
||||||
|
"""
|
||||||
|
Evaluate a function without caching intermediate activations, allowing for
|
||||||
|
reduced memory at the expense of extra compute in the backward pass.
|
||||||
|
:param func: the function to evaluate.
|
||||||
|
:param inputs: the argument sequence to pass to `func`.
|
||||||
|
:param params: a sequence of parameters `func` depends on but does not
|
||||||
|
explicitly take as arguments.
|
||||||
|
:param flag: if False, disable gradient checkpointing.
|
||||||
|
"""
|
||||||
|
if flag:
|
||||||
|
args = tuple(inputs) + tuple(params)
|
||||||
|
return CheckpointFunction.apply(func, len(inputs), *args)
|
||||||
|
else:
|
||||||
|
return func(*inputs)
|
||||||
|
|
||||||
|
|
||||||
|
class CheckpointFunction(torch.autograd.Function):
|
||||||
|
@staticmethod
|
||||||
|
def forward(ctx, run_function, length, *args):
|
||||||
|
ctx.run_function = run_function
|
||||||
|
ctx.input_tensors = list(args[:length])
|
||||||
|
ctx.input_params = list(args[length:])
|
||||||
|
ctx.gpu_autocast_kwargs = {"enabled": torch.is_autocast_enabled(),
|
||||||
|
"dtype": torch.get_autocast_gpu_dtype(),
|
||||||
|
"cache_enabled": torch.is_autocast_cache_enabled()}
|
||||||
|
with torch.no_grad():
|
||||||
|
output_tensors = ctx.run_function(*ctx.input_tensors)
|
||||||
|
return output_tensors
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def backward(ctx, *output_grads):
|
||||||
|
ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
|
||||||
|
with torch.enable_grad(), \
|
||||||
|
torch.cuda.amp.autocast(**ctx.gpu_autocast_kwargs):
|
||||||
|
# Fixes a bug where the first op in run_function modifies the
|
||||||
|
# Tensor storage in place, which is not allowed for detach()'d
|
||||||
|
# Tensors.
|
||||||
|
shallow_copies = [x.view_as(x) for x in ctx.input_tensors]
|
||||||
|
output_tensors = ctx.run_function(*shallow_copies)
|
||||||
|
input_grads = torch.autograd.grad(
|
||||||
|
output_tensors,
|
||||||
|
ctx.input_tensors + ctx.input_params,
|
||||||
|
output_grads,
|
||||||
|
allow_unused=True,
|
||||||
|
)
|
||||||
|
del ctx.input_tensors
|
||||||
|
del ctx.input_params
|
||||||
|
del output_tensors
|
||||||
|
return (None, None) + input_grads
|
||||||
|
|
||||||
|
|
||||||
|
def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False):
|
||||||
|
"""
|
||||||
|
Create sinusoidal timestep embeddings.
|
||||||
|
:param timesteps: a 1-D Tensor of N indices, one per batch element.
|
||||||
|
These may be fractional.
|
||||||
|
:param dim: the dimension of the output.
|
||||||
|
:param max_period: controls the minimum frequency of the embeddings.
|
||||||
|
:return: an [N x dim] Tensor of positional embeddings.
|
||||||
|
"""
|
||||||
|
if not repeat_only:
|
||||||
|
half = dim // 2
|
||||||
|
freqs = torch.exp(
|
||||||
|
-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
|
||||||
|
).to(device=timesteps.device)
|
||||||
|
args = timesteps[:, None].float() * freqs[None]
|
||||||
|
embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
|
||||||
|
if dim % 2:
|
||||||
|
embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
|
||||||
|
else:
|
||||||
|
embedding = repeat(timesteps, 'b -> b d', d=dim)
|
||||||
|
return embedding
|
||||||
|
|
||||||
|
|
||||||
|
def zero_module(module):
|
||||||
|
"""
|
||||||
|
Zero out the parameters of a module and return it.
|
||||||
|
"""
|
||||||
|
for p in module.parameters():
|
||||||
|
p.detach().zero_()
|
||||||
|
return module
|
||||||
|
|
||||||
|
|
||||||
|
def scale_module(module, scale):
|
||||||
|
"""
|
||||||
|
Scale the parameters of a module and return it.
|
||||||
|
"""
|
||||||
|
for p in module.parameters():
|
||||||
|
p.detach().mul_(scale)
|
||||||
|
return module
|
||||||
|
|
||||||
|
|
||||||
|
def mean_flat(tensor):
|
||||||
|
"""
|
||||||
|
Take the mean over all non-batch dimensions.
|
||||||
|
"""
|
||||||
|
return tensor.mean(dim=list(range(1, len(tensor.shape))))
|
||||||
|
|
||||||
|
|
||||||
|
def normalization(channels):
|
||||||
|
"""
|
||||||
|
Make a standard normalization layer.
|
||||||
|
:param channels: number of input channels.
|
||||||
|
:return: an nn.Module for normalization.
|
||||||
|
"""
|
||||||
|
return GroupNorm32(32, channels)
|
||||||
|
|
||||||
|
|
||||||
|
# PyTorch 1.7 has SiLU, but we support PyTorch 1.5.
|
||||||
|
class SiLU(nn.Module):
|
||||||
|
def forward(self, x):
|
||||||
|
return x * torch.sigmoid(x)
|
||||||
|
|
||||||
|
|
||||||
|
class GroupNorm32(nn.GroupNorm):
|
||||||
|
def forward(self, x):
|
||||||
|
# return super().forward(x.float()).type(x.dtype)
|
||||||
|
return super().forward(x).type(x.dtype)
|
||||||
|
|
||||||
|
def conv_nd(dims, *args, **kwargs):
|
||||||
|
"""
|
||||||
|
Create a 1D, 2D, or 3D convolution module.
|
||||||
|
"""
|
||||||
|
if dims == 1:
|
||||||
|
return nn.Conv1d(*args, **kwargs)
|
||||||
|
elif dims == 2:
|
||||||
|
return nn.Conv2d(*args, **kwargs)
|
||||||
|
elif dims == 3:
|
||||||
|
return nn.Conv3d(*args, **kwargs)
|
||||||
|
raise ValueError(f"unsupported dimensions: {dims}")
|
||||||
|
|
||||||
|
|
||||||
|
def linear(*args, **kwargs):
|
||||||
|
"""
|
||||||
|
Create a linear module.
|
||||||
|
"""
|
||||||
|
return nn.Linear(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def avg_pool_nd(dims, *args, **kwargs):
|
||||||
|
"""
|
||||||
|
Create a 1D, 2D, or 3D average pooling module.
|
||||||
|
"""
|
||||||
|
if dims == 1:
|
||||||
|
return nn.AvgPool1d(*args, **kwargs)
|
||||||
|
elif dims == 2:
|
||||||
|
return nn.AvgPool2d(*args, **kwargs)
|
||||||
|
elif dims == 3:
|
||||||
|
return nn.AvgPool3d(*args, **kwargs)
|
||||||
|
raise ValueError(f"unsupported dimensions: {dims}")
|
||||||
|
|
||||||
|
|
||||||
|
class HybridConditioner(nn.Module):
|
||||||
|
|
||||||
|
def __init__(self, c_concat_config, c_crossattn_config):
|
||||||
|
super().__init__()
|
||||||
|
self.concat_conditioner = instantiate_from_config(c_concat_config)
|
||||||
|
self.crossattn_conditioner = instantiate_from_config(c_crossattn_config)
|
||||||
|
|
||||||
|
def forward(self, c_concat, c_crossattn):
|
||||||
|
c_concat = self.concat_conditioner(c_concat)
|
||||||
|
c_crossattn = self.crossattn_conditioner(c_crossattn)
|
||||||
|
return {'c_concat': [c_concat], 'c_crossattn': [c_crossattn]}
|
||||||
|
|
||||||
|
|
||||||
|
def noise_like(shape, device, repeat=False):
|
||||||
|
repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
|
||||||
|
noise = lambda: torch.randn(shape, device=device)
|
||||||
|
return repeat_noise() if repeat else noise()
|
@ -0,0 +1,92 @@
|
|||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
class AbstractDistribution:
|
||||||
|
def sample(self):
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def mode(self):
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
|
||||||
|
class DiracDistribution(AbstractDistribution):
|
||||||
|
def __init__(self, value):
|
||||||
|
self.value = value
|
||||||
|
|
||||||
|
def sample(self):
|
||||||
|
return self.value
|
||||||
|
|
||||||
|
def mode(self):
|
||||||
|
return self.value
|
||||||
|
|
||||||
|
|
||||||
|
class DiagonalGaussianDistribution(object):
|
||||||
|
def __init__(self, parameters, deterministic=False):
|
||||||
|
self.parameters = parameters
|
||||||
|
self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
|
||||||
|
self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
|
||||||
|
self.deterministic = deterministic
|
||||||
|
self.std = torch.exp(0.5 * self.logvar)
|
||||||
|
self.var = torch.exp(self.logvar)
|
||||||
|
if self.deterministic:
|
||||||
|
self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device)
|
||||||
|
|
||||||
|
def sample(self):
|
||||||
|
x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device)
|
||||||
|
return x
|
||||||
|
|
||||||
|
def kl(self, other=None):
|
||||||
|
if self.deterministic:
|
||||||
|
return torch.Tensor([0.])
|
||||||
|
else:
|
||||||
|
if other is None:
|
||||||
|
return 0.5 * torch.sum(torch.pow(self.mean, 2)
|
||||||
|
+ self.var - 1.0 - self.logvar,
|
||||||
|
dim=[1, 2, 3])
|
||||||
|
else:
|
||||||
|
return 0.5 * torch.sum(
|
||||||
|
torch.pow(self.mean - other.mean, 2) / other.var
|
||||||
|
+ self.var / other.var - 1.0 - self.logvar + other.logvar,
|
||||||
|
dim=[1, 2, 3])
|
||||||
|
|
||||||
|
def nll(self, sample, dims=[1,2,3]):
|
||||||
|
if self.deterministic:
|
||||||
|
return torch.Tensor([0.])
|
||||||
|
logtwopi = np.log(2.0 * np.pi)
|
||||||
|
return 0.5 * torch.sum(
|
||||||
|
logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
|
||||||
|
dim=dims)
|
||||||
|
|
||||||
|
def mode(self):
|
||||||
|
return self.mean
|
||||||
|
|
||||||
|
|
||||||
|
def normal_kl(mean1, logvar1, mean2, logvar2):
|
||||||
|
"""
|
||||||
|
source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12
|
||||||
|
Compute the KL divergence between two gaussians.
|
||||||
|
Shapes are automatically broadcasted, so batches can be compared to
|
||||||
|
scalars, among other use cases.
|
||||||
|
"""
|
||||||
|
tensor = None
|
||||||
|
for obj in (mean1, logvar1, mean2, logvar2):
|
||||||
|
if isinstance(obj, torch.Tensor):
|
||||||
|
tensor = obj
|
||||||
|
break
|
||||||
|
assert tensor is not None, "at least one argument must be a Tensor"
|
||||||
|
|
||||||
|
# Force variances to be Tensors. Broadcasting helps convert scalars to
|
||||||
|
# Tensors, but it does not work for torch.exp().
|
||||||
|
logvar1, logvar2 = [
|
||||||
|
x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor)
|
||||||
|
for x in (logvar1, logvar2)
|
||||||
|
]
|
||||||
|
|
||||||
|
return 0.5 * (
|
||||||
|
-1.0
|
||||||
|
+ logvar2
|
||||||
|
- logvar1
|
||||||
|
+ torch.exp(logvar1 - logvar2)
|
||||||
|
+ ((mean1 - mean2) ** 2) * torch.exp(-logvar2)
|
||||||
|
)
|
80
AnyText/AnyText_scripts/ldm/modules/ema.py
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
import torch
|
||||||
|
from torch import nn
|
||||||
|
|
||||||
|
|
||||||
|
class LitEma(nn.Module):
|
||||||
|
def __init__(self, model, decay=0.9999, use_num_upates=True):
|
||||||
|
super().__init__()
|
||||||
|
if decay < 0.0 or decay > 1.0:
|
||||||
|
raise ValueError('Decay must be between 0 and 1')
|
||||||
|
|
||||||
|
self.m_name2s_name = {}
|
||||||
|
self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32))
|
||||||
|
self.register_buffer('num_updates', torch.tensor(0, dtype=torch.int) if use_num_upates
|
||||||
|
else torch.tensor(-1, dtype=torch.int))
|
||||||
|
|
||||||
|
for name, p in model.named_parameters():
|
||||||
|
if p.requires_grad:
|
||||||
|
# remove as '.'-character is not allowed in buffers
|
||||||
|
s_name = name.replace('.', '')
|
||||||
|
self.m_name2s_name.update({name: s_name})
|
||||||
|
self.register_buffer(s_name, p.clone().detach().data)
|
||||||
|
|
||||||
|
self.collected_params = []
|
||||||
|
|
||||||
|
def reset_num_updates(self):
|
||||||
|
del self.num_updates
|
||||||
|
self.register_buffer('num_updates', torch.tensor(0, dtype=torch.int))
|
||||||
|
|
||||||
|
def forward(self, model):
|
||||||
|
decay = self.decay
|
||||||
|
|
||||||
|
if self.num_updates >= 0:
|
||||||
|
self.num_updates += 1
|
||||||
|
decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates))
|
||||||
|
|
||||||
|
one_minus_decay = 1.0 - decay
|
||||||
|
|
||||||
|
with torch.no_grad():
|
||||||
|
m_param = dict(model.named_parameters())
|
||||||
|
shadow_params = dict(self.named_buffers())
|
||||||
|
|
||||||
|
for key in m_param:
|
||||||
|
if m_param[key].requires_grad:
|
||||||
|
sname = self.m_name2s_name[key]
|
||||||
|
shadow_params[sname] = shadow_params[sname].type_as(m_param[key])
|
||||||
|
shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key]))
|
||||||
|
else:
|
||||||
|
assert not key in self.m_name2s_name
|
||||||
|
|
||||||
|
def copy_to(self, model):
|
||||||
|
m_param = dict(model.named_parameters())
|
||||||
|
shadow_params = dict(self.named_buffers())
|
||||||
|
for key in m_param:
|
||||||
|
if m_param[key].requires_grad:
|
||||||
|
m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data)
|
||||||
|
else:
|
||||||
|
assert not key in self.m_name2s_name
|
||||||
|
|
||||||
|
def store(self, parameters):
|
||||||
|
"""
|
||||||
|
Save the current parameters for restoring later.
|
||||||
|
Args:
|
||||||
|
parameters: Iterable of `torch.nn.Parameter`; the parameters to be
|
||||||
|
temporarily stored.
|
||||||
|
"""
|
||||||
|
self.collected_params = [param.clone() for param in parameters]
|
||||||
|
|
||||||
|
def restore(self, parameters):
|
||||||
|
"""
|
||||||
|
Restore the parameters stored with the `store` method.
|
||||||
|
Useful to validate the model with EMA parameters without affecting the
|
||||||
|
original optimization process. Store the parameters before the
|
||||||
|
`copy_to` method. After validation (or model saving), use this to
|
||||||
|
restore the former parameters.
|
||||||
|
Args:
|
||||||
|
parameters: Iterable of `torch.nn.Parameter`; the parameters to be
|
||||||
|
updated with the stored parameters.
|
||||||
|
"""
|
||||||
|
for c_param, param in zip(self.collected_params, parameters):
|
||||||
|
param.data.copy_(c_param.data)
|
415
AnyText/AnyText_scripts/ldm/modules/encoders/modules.py
Normal file
@ -0,0 +1,415 @@
|
|||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from torch.utils.checkpoint import checkpoint
|
||||||
|
|
||||||
|
from transformers import T5Tokenizer, T5EncoderModel, CLIPTokenizer, CLIPTextModel, AutoProcessor, CLIPVisionModelWithProjection
|
||||||
|
|
||||||
|
import open_clip
|
||||||
|
from ...util import count_params
|
||||||
|
|
||||||
|
|
||||||
|
def _expand_mask(mask, dtype, tgt_len=None):
|
||||||
|
"""
|
||||||
|
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
|
||||||
|
"""
|
||||||
|
bsz, src_len = mask.size()
|
||||||
|
tgt_len = tgt_len if tgt_len is not None else src_len
|
||||||
|
|
||||||
|
expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
|
||||||
|
|
||||||
|
inverted_mask = 1.0 - expanded_mask
|
||||||
|
|
||||||
|
return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_causal_attention_mask(bsz, seq_len, dtype):
|
||||||
|
# lazily create causal attention mask, with full attention between the vision tokens
|
||||||
|
# pytorch uses additive attention mask; fill with -inf
|
||||||
|
mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype)
|
||||||
|
mask.fill_(torch.tensor(torch.finfo(dtype).min))
|
||||||
|
mask.triu_(1) # zero out the lower diagonal
|
||||||
|
mask = mask.unsqueeze(1) # expand mask
|
||||||
|
return mask
|
||||||
|
|
||||||
|
class AbstractEncoder(nn.Module):
|
||||||
|
def __init__(self):
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
def encode(self, *args, **kwargs):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
class IdentityEncoder(AbstractEncoder):
|
||||||
|
|
||||||
|
def encode(self, x):
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class ClassEmbedder(nn.Module):
|
||||||
|
def __init__(self, embed_dim, n_classes=1000, key='class', ucg_rate=0.1):
|
||||||
|
super().__init__()
|
||||||
|
self.key = key
|
||||||
|
self.embedding = nn.Embedding(n_classes, embed_dim)
|
||||||
|
self.n_classes = n_classes
|
||||||
|
self.ucg_rate = ucg_rate
|
||||||
|
|
||||||
|
def forward(self, batch, key=None, disable_dropout=False):
|
||||||
|
if key is None:
|
||||||
|
key = self.key
|
||||||
|
# this is for use in crossattn
|
||||||
|
c = batch[key][:, None]
|
||||||
|
if self.ucg_rate > 0. and not disable_dropout:
|
||||||
|
mask = 1. - torch.bernoulli(torch.ones_like(c) * self.ucg_rate)
|
||||||
|
c = mask * c + (1-mask) * torch.ones_like(c)*(self.n_classes-1)
|
||||||
|
c = c.long()
|
||||||
|
c = self.embedding(c)
|
||||||
|
return c
|
||||||
|
|
||||||
|
def get_unconditional_conditioning(self, bs, device="cuda"):
|
||||||
|
uc_class = self.n_classes - 1 # 1000 classes --> 0 ... 999, one extra class for ucg (class 1000)
|
||||||
|
uc = torch.ones((bs,), device=device) * uc_class
|
||||||
|
uc = {self.key: uc}
|
||||||
|
return uc
|
||||||
|
|
||||||
|
|
||||||
|
def disabled_train(self, mode=True):
|
||||||
|
"""Overwrite model.train with this function to make sure train/eval mode
|
||||||
|
does not change anymore."""
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
||||||
|
class FrozenT5Embedder(AbstractEncoder):
|
||||||
|
"""Uses the T5 transformer encoder for text"""
|
||||||
|
def __init__(self, version="google/t5-v1_1-large", device="cuda", max_length=77, freeze=True): # others are google/t5-v1_1-xl and google/t5-v1_1-xxl
|
||||||
|
super().__init__()
|
||||||
|
self.tokenizer = T5Tokenizer.from_pretrained(version)
|
||||||
|
self.transformer = T5EncoderModel.from_pretrained(version)
|
||||||
|
self.device = device
|
||||||
|
self.max_length = max_length # TODO: typical value?
|
||||||
|
if freeze:
|
||||||
|
self.freeze()
|
||||||
|
|
||||||
|
def freeze(self):
|
||||||
|
self.transformer = self.transformer.eval()
|
||||||
|
#self.train = disabled_train
|
||||||
|
for param in self.parameters():
|
||||||
|
param.requires_grad = False
|
||||||
|
|
||||||
|
def forward(self, text):
|
||||||
|
batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
|
||||||
|
return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
|
||||||
|
tokens = batch_encoding["input_ids"].to(self.device)
|
||||||
|
outputs = self.transformer(input_ids=tokens)
|
||||||
|
|
||||||
|
z = outputs.last_hidden_state
|
||||||
|
return z
|
||||||
|
|
||||||
|
def encode(self, text):
|
||||||
|
return self(text)
|
||||||
|
|
||||||
|
|
||||||
|
class FrozenCLIPEmbedder(AbstractEncoder):
|
||||||
|
"""Uses the CLIP transformer encoder for text (from huggingface)"""
|
||||||
|
LAYERS = [
|
||||||
|
"last",
|
||||||
|
"pooled",
|
||||||
|
"hidden"
|
||||||
|
]
|
||||||
|
def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77,
|
||||||
|
freeze=True, layer="last", layer_idx=None): # clip-vit-base-patch32
|
||||||
|
super().__init__()
|
||||||
|
assert layer in self.LAYERS
|
||||||
|
self.tokenizer = CLIPTokenizer.from_pretrained(version)
|
||||||
|
self.transformer = CLIPTextModel.from_pretrained(version)
|
||||||
|
self.device = device
|
||||||
|
self.max_length = max_length
|
||||||
|
if freeze:
|
||||||
|
self.freeze()
|
||||||
|
self.layer = layer
|
||||||
|
self.layer_idx = layer_idx
|
||||||
|
if layer == "hidden":
|
||||||
|
assert layer_idx is not None
|
||||||
|
assert 0 <= abs(layer_idx) <= 12
|
||||||
|
|
||||||
|
def freeze(self):
|
||||||
|
self.transformer = self.transformer.eval()
|
||||||
|
# self.train = disabled_train
|
||||||
|
for param in self.parameters():
|
||||||
|
param.requires_grad = False
|
||||||
|
|
||||||
|
def forward(self, text):
|
||||||
|
batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
|
||||||
|
return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
|
||||||
|
tokens = batch_encoding["input_ids"].to(self.device)
|
||||||
|
outputs = self.transformer(input_ids=tokens, output_hidden_states=self.layer=="hidden")
|
||||||
|
if self.layer == "last":
|
||||||
|
z = outputs.last_hidden_state
|
||||||
|
elif self.layer == "pooled":
|
||||||
|
z = outputs.pooler_output[:, None, :]
|
||||||
|
else:
|
||||||
|
z = outputs.hidden_states[self.layer_idx]
|
||||||
|
return z
|
||||||
|
|
||||||
|
def encode(self, text):
|
||||||
|
return self(text)
|
||||||
|
|
||||||
|
|
||||||
|
class FrozenOpenCLIPEmbedder(AbstractEncoder):
|
||||||
|
"""
|
||||||
|
Uses the OpenCLIP transformer encoder for text
|
||||||
|
"""
|
||||||
|
LAYERS = [
|
||||||
|
# "pooled",
|
||||||
|
"last",
|
||||||
|
"penultimate"
|
||||||
|
]
|
||||||
|
|
||||||
|
def __init__(self, arch="ViT-H-14", version="laion2b_s32b_b79k", device="cuda", max_length=77,
|
||||||
|
freeze=True, layer="last"):
|
||||||
|
super().__init__()
|
||||||
|
assert layer in self.LAYERS
|
||||||
|
model, _, _ = open_clip.create_model_and_transforms(arch, device=torch.device('cpu'), pretrained=version)
|
||||||
|
del model.visual
|
||||||
|
self.model = model
|
||||||
|
|
||||||
|
self.device = device
|
||||||
|
self.max_length = max_length
|
||||||
|
if freeze:
|
||||||
|
self.freeze()
|
||||||
|
self.layer = layer
|
||||||
|
if self.layer == "last":
|
||||||
|
self.layer_idx = 0
|
||||||
|
elif self.layer == "penultimate":
|
||||||
|
self.layer_idx = 1
|
||||||
|
else:
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def freeze(self):
|
||||||
|
self.model = self.model.eval()
|
||||||
|
for param in self.parameters():
|
||||||
|
param.requires_grad = False
|
||||||
|
|
||||||
|
def forward(self, text):
|
||||||
|
tokens = open_clip.tokenize(text)
|
||||||
|
z = self.encode_with_transformer(tokens.to(self.device))
|
||||||
|
return z
|
||||||
|
|
||||||
|
def encode_with_transformer(self, text):
|
||||||
|
x = self.model.token_embedding(text) # [batch_size, n_ctx, d_model]
|
||||||
|
x = x + self.model.positional_embedding
|
||||||
|
x = x.permute(1, 0, 2) # NLD -> LND
|
||||||
|
x = self.text_transformer_forward(x, attn_mask=self.model.attn_mask)
|
||||||
|
x = x.permute(1, 0, 2) # LND -> NLD
|
||||||
|
x = self.model.ln_final(x)
|
||||||
|
return x
|
||||||
|
|
||||||
|
def text_transformer_forward(self, x: torch.Tensor, attn_mask=None):
|
||||||
|
for i, r in enumerate(self.model.transformer.resblocks):
|
||||||
|
if i == len(self.model.transformer.resblocks) - self.layer_idx:
|
||||||
|
break
|
||||||
|
if self.model.transformer.grad_checkpointing and not torch.jit.is_scripting():
|
||||||
|
x = checkpoint(r, x, attn_mask)
|
||||||
|
else:
|
||||||
|
x = r(x, attn_mask=attn_mask)
|
||||||
|
return x
|
||||||
|
|
||||||
|
def encode(self, text):
|
||||||
|
return self(text)
|
||||||
|
|
||||||
|
|
||||||
|
class FrozenCLIPT5Encoder(AbstractEncoder):
|
||||||
|
def __init__(self, clip_version="openai/clip-vit-large-patch14", t5_version="google/t5-v1_1-xl", device="cuda",
|
||||||
|
clip_max_length=77, t5_max_length=77):
|
||||||
|
super().__init__()
|
||||||
|
self.clip_encoder = FrozenCLIPEmbedder(clip_version, device, max_length=clip_max_length)
|
||||||
|
self.t5_encoder = FrozenT5Embedder(t5_version, device, max_length=t5_max_length)
|
||||||
|
print(f"{self.clip_encoder.__class__.__name__} has {count_params(self.clip_encoder)*1.e-6:.2f} M parameters, "
|
||||||
|
f"{self.t5_encoder.__class__.__name__} comes with {count_params(self.t5_encoder)*1.e-6:.2f} M params.")
|
||||||
|
|
||||||
|
def encode(self, text):
|
||||||
|
return self(text)
|
||||||
|
|
||||||
|
def forward(self, text):
|
||||||
|
clip_z = self.clip_encoder.encode(text)
|
||||||
|
t5_z = self.t5_encoder.encode(text)
|
||||||
|
return [clip_z, t5_z]
|
||||||
|
|
||||||
|
|
||||||
|
class FrozenCLIPEmbedderT3(AbstractEncoder):
|
||||||
|
"""Uses the CLIP transformer encoder for text (from Hugging Face)"""
|
||||||
|
def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77, freeze=True, use_vision=False):
|
||||||
|
super().__init__()
|
||||||
|
self.tokenizer = CLIPTokenizer.from_pretrained(version)
|
||||||
|
self.transformer = CLIPTextModel.from_pretrained(version)
|
||||||
|
if use_vision:
|
||||||
|
self.vit = CLIPVisionModelWithProjection.from_pretrained(version)
|
||||||
|
self.processor = AutoProcessor.from_pretrained(version)
|
||||||
|
self.device = device
|
||||||
|
self.max_length = max_length
|
||||||
|
if freeze:
|
||||||
|
self.freeze()
|
||||||
|
|
||||||
|
def embedding_forward(
|
||||||
|
self,
|
||||||
|
input_ids=None,
|
||||||
|
position_ids=None,
|
||||||
|
inputs_embeds=None,
|
||||||
|
embedding_manager=None,
|
||||||
|
):
|
||||||
|
seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
|
||||||
|
if position_ids is None:
|
||||||
|
position_ids = self.position_ids[:, :seq_length]
|
||||||
|
if inputs_embeds is None:
|
||||||
|
inputs_embeds = self.token_embedding(input_ids)
|
||||||
|
if embedding_manager is not None:
|
||||||
|
inputs_embeds = embedding_manager(input_ids, inputs_embeds)
|
||||||
|
position_embeddings = self.position_embedding(position_ids)
|
||||||
|
embeddings = inputs_embeds + position_embeddings
|
||||||
|
return embeddings
|
||||||
|
|
||||||
|
self.transformer.text_model.embeddings.forward = embedding_forward.__get__(self.transformer.text_model.embeddings)
|
||||||
|
|
||||||
|
def encoder_forward(
|
||||||
|
self,
|
||||||
|
inputs_embeds,
|
||||||
|
attention_mask=None,
|
||||||
|
causal_attention_mask=None,
|
||||||
|
output_attentions=None,
|
||||||
|
output_hidden_states=None,
|
||||||
|
return_dict=None,
|
||||||
|
):
|
||||||
|
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||||
|
output_hidden_states = (
|
||||||
|
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||||
|
)
|
||||||
|
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||||
|
encoder_states = () if output_hidden_states else None
|
||||||
|
all_attentions = () if output_attentions else None
|
||||||
|
hidden_states = inputs_embeds
|
||||||
|
for idx, encoder_layer in enumerate(self.layers):
|
||||||
|
if output_hidden_states:
|
||||||
|
encoder_states = encoder_states + (hidden_states,)
|
||||||
|
layer_outputs = encoder_layer(
|
||||||
|
hidden_states,
|
||||||
|
attention_mask,
|
||||||
|
causal_attention_mask,
|
||||||
|
output_attentions=output_attentions,
|
||||||
|
)
|
||||||
|
hidden_states = layer_outputs[0]
|
||||||
|
if output_attentions:
|
||||||
|
all_attentions = all_attentions + (layer_outputs[1],)
|
||||||
|
if output_hidden_states:
|
||||||
|
encoder_states = encoder_states + (hidden_states,)
|
||||||
|
return hidden_states
|
||||||
|
|
||||||
|
self.transformer.text_model.encoder.forward = encoder_forward.__get__(self.transformer.text_model.encoder)
|
||||||
|
|
||||||
|
def text_encoder_forward(
|
||||||
|
self,
|
||||||
|
input_ids=None,
|
||||||
|
attention_mask=None,
|
||||||
|
position_ids=None,
|
||||||
|
output_attentions=None,
|
||||||
|
output_hidden_states=None,
|
||||||
|
return_dict=None,
|
||||||
|
embedding_manager=None,
|
||||||
|
):
|
||||||
|
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||||
|
output_hidden_states = (
|
||||||
|
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||||
|
)
|
||||||
|
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||||
|
if input_ids is None:
|
||||||
|
raise ValueError("You have to specify either input_ids")
|
||||||
|
input_shape = input_ids.size()
|
||||||
|
input_ids = input_ids.view(-1, input_shape[-1])
|
||||||
|
hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids, embedding_manager=embedding_manager)
|
||||||
|
bsz, seq_len = input_shape
|
||||||
|
# CLIP's text model uses causal mask, prepare it here.
|
||||||
|
# https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
|
||||||
|
causal_attention_mask = _build_causal_attention_mask(bsz, seq_len, hidden_states.dtype).to(
|
||||||
|
hidden_states.device
|
||||||
|
)
|
||||||
|
# expand attention_mask
|
||||||
|
if attention_mask is not None:
|
||||||
|
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
|
||||||
|
attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
|
||||||
|
last_hidden_state = self.encoder(
|
||||||
|
inputs_embeds=hidden_states,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
causal_attention_mask=causal_attention_mask,
|
||||||
|
output_attentions=output_attentions,
|
||||||
|
output_hidden_states=output_hidden_states,
|
||||||
|
return_dict=return_dict,
|
||||||
|
)
|
||||||
|
last_hidden_state = self.final_layer_norm(last_hidden_state)
|
||||||
|
return last_hidden_state
|
||||||
|
|
||||||
|
self.transformer.text_model.forward = text_encoder_forward.__get__(self.transformer.text_model)
|
||||||
|
|
||||||
|
def transformer_forward(
|
||||||
|
self,
|
||||||
|
input_ids=None,
|
||||||
|
attention_mask=None,
|
||||||
|
position_ids=None,
|
||||||
|
output_attentions=None,
|
||||||
|
output_hidden_states=None,
|
||||||
|
return_dict=None,
|
||||||
|
embedding_manager=None,
|
||||||
|
):
|
||||||
|
return self.text_model(
|
||||||
|
input_ids=input_ids,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
position_ids=position_ids,
|
||||||
|
output_attentions=output_attentions,
|
||||||
|
output_hidden_states=output_hidden_states,
|
||||||
|
return_dict=return_dict,
|
||||||
|
embedding_manager=embedding_manager
|
||||||
|
)
|
||||||
|
|
||||||
|
self.transformer.forward = transformer_forward.__get__(self.transformer)
|
||||||
|
|
||||||
|
def freeze(self):
|
||||||
|
self.transformer = self.transformer.eval()
|
||||||
|
for param in self.parameters():
|
||||||
|
param.requires_grad = False
|
||||||
|
|
||||||
|
def forward(self, text, **kwargs):
|
||||||
|
batch_encoding = self.tokenizer(text, truncation=False, max_length=self.max_length, return_length=True,
|
||||||
|
return_overflowing_tokens=False, padding="longest", return_tensors="pt")
|
||||||
|
input_ids = batch_encoding["input_ids"]
|
||||||
|
tokens_list = self.split_chunks(input_ids)
|
||||||
|
z_list = []
|
||||||
|
for tokens in tokens_list:
|
||||||
|
tokens = tokens.to(self.device)
|
||||||
|
_z = self.transformer(input_ids=tokens, **kwargs)
|
||||||
|
z_list += [_z]
|
||||||
|
return torch.cat(z_list, dim=1)
|
||||||
|
|
||||||
|
def encode(self, text, **kwargs):
|
||||||
|
return self(text, **kwargs)
|
||||||
|
|
||||||
|
def split_chunks(self, input_ids, chunk_size=75):
|
||||||
|
tokens_list = []
|
||||||
|
bs, n = input_ids.shape
|
||||||
|
id_start = input_ids[:, 0].unsqueeze(1) # dim --> [bs, 1]
|
||||||
|
id_end = input_ids[:, -1].unsqueeze(1)
|
||||||
|
if n == 2: # empty caption
|
||||||
|
tokens_list.append(torch.cat((id_start, )+(id_end, )*(chunk_size+1), dim=1))
|
||||||
|
|
||||||
|
trimmed_encoding = input_ids[:, 1:-1]
|
||||||
|
num_full_groups = (n - 2) // chunk_size
|
||||||
|
|
||||||
|
for i in range(num_full_groups):
|
||||||
|
group = trimmed_encoding[:, i * chunk_size:(i + 1) * chunk_size]
|
||||||
|
group_pad = torch.cat((id_start, group, id_end), dim=1)
|
||||||
|
tokens_list.append(group_pad)
|
||||||
|
|
||||||
|
remaining_columns = (n - 2) % chunk_size
|
||||||
|
if remaining_columns > 0:
|
||||||
|
remaining_group = trimmed_encoding[:, -remaining_columns:]
|
||||||
|
padding_columns = chunk_size - remaining_group.shape[1]
|
||||||
|
padding = id_end.expand(bs, padding_columns)
|
||||||
|
remaining_group_pad = torch.cat((id_start, remaining_group, padding, id_end), dim=1)
|
||||||
|
tokens_list.append(remaining_group_pad)
|
||||||
|
return tokens_list
|
@ -0,0 +1,2 @@
|
|||||||
|
from ldm.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr
|
||||||
|
from ldm.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light
|
730
AnyText/AnyText_scripts/ldm/modules/image_degradation/bsrgan.py
Normal file
@ -0,0 +1,730 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
# --------------------------------------------
|
||||||
|
# Super-Resolution
|
||||||
|
# --------------------------------------------
|
||||||
|
#
|
||||||
|
# Kai Zhang (cskaizhang@gmail.com)
|
||||||
|
# https://github.com/cszn
|
||||||
|
# From 2019/03--2021/08
|
||||||
|
# --------------------------------------------
|
||||||
|
"""
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import cv2
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from functools import partial
|
||||||
|
import random
|
||||||
|
from scipy import ndimage
|
||||||
|
import scipy
|
||||||
|
import scipy.stats as ss
|
||||||
|
from scipy.interpolate import interp2d
|
||||||
|
from scipy.linalg import orth
|
||||||
|
import albumentations
|
||||||
|
|
||||||
|
from . import utils_image as util
|
||||||
|
|
||||||
|
|
||||||
|
def modcrop_np(img, sf):
|
||||||
|
'''
|
||||||
|
Args:
|
||||||
|
img: numpy image, WxH or WxHxC
|
||||||
|
sf: scale factor
|
||||||
|
Return:
|
||||||
|
cropped image
|
||||||
|
'''
|
||||||
|
w, h = img.shape[:2]
|
||||||
|
im = np.copy(img)
|
||||||
|
return im[:w - w % sf, :h - h % sf, ...]
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
# --------------------------------------------
|
||||||
|
# anisotropic Gaussian kernels
|
||||||
|
# --------------------------------------------
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def analytic_kernel(k):
|
||||||
|
"""Calculate the X4 kernel from the X2 kernel (for proof see appendix in paper)"""
|
||||||
|
k_size = k.shape[0]
|
||||||
|
# Calculate the big kernels size
|
||||||
|
big_k = np.zeros((3 * k_size - 2, 3 * k_size - 2))
|
||||||
|
# Loop over the small kernel to fill the big one
|
||||||
|
for r in range(k_size):
|
||||||
|
for c in range(k_size):
|
||||||
|
big_k[2 * r:2 * r + k_size, 2 * c:2 * c + k_size] += k[r, c] * k
|
||||||
|
# Crop the edges of the big kernel to ignore very small values and increase run time of SR
|
||||||
|
crop = k_size // 2
|
||||||
|
cropped_big_k = big_k[crop:-crop, crop:-crop]
|
||||||
|
# Normalize to 1
|
||||||
|
return cropped_big_k / cropped_big_k.sum()
|
||||||
|
|
||||||
|
|
||||||
|
def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6):
|
||||||
|
""" generate an anisotropic Gaussian kernel
|
||||||
|
Args:
|
||||||
|
ksize : e.g., 15, kernel size
|
||||||
|
theta : [0, pi], rotation angle range
|
||||||
|
l1 : [0.1,50], scaling of eigenvalues
|
||||||
|
l2 : [0.1,l1], scaling of eigenvalues
|
||||||
|
If l1 = l2, will get an isotropic Gaussian kernel.
|
||||||
|
Returns:
|
||||||
|
k : kernel
|
||||||
|
"""
|
||||||
|
|
||||||
|
v = np.dot(np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]), np.array([1., 0.]))
|
||||||
|
V = np.array([[v[0], v[1]], [v[1], -v[0]]])
|
||||||
|
D = np.array([[l1, 0], [0, l2]])
|
||||||
|
Sigma = np.dot(np.dot(V, D), np.linalg.inv(V))
|
||||||
|
k = gm_blur_kernel(mean=[0, 0], cov=Sigma, size=ksize)
|
||||||
|
|
||||||
|
return k
|
||||||
|
|
||||||
|
|
||||||
|
def gm_blur_kernel(mean, cov, size=15):
|
||||||
|
center = size / 2.0 + 0.5
|
||||||
|
k = np.zeros([size, size])
|
||||||
|
for y in range(size):
|
||||||
|
for x in range(size):
|
||||||
|
cy = y - center + 1
|
||||||
|
cx = x - center + 1
|
||||||
|
k[y, x] = ss.multivariate_normal.pdf([cx, cy], mean=mean, cov=cov)
|
||||||
|
|
||||||
|
k = k / np.sum(k)
|
||||||
|
return k
|
||||||
|
|
||||||
|
|
||||||
|
def shift_pixel(x, sf, upper_left=True):
|
||||||
|
"""shift pixel for super-resolution with different scale factors
|
||||||
|
Args:
|
||||||
|
x: WxHxC or WxH
|
||||||
|
sf: scale factor
|
||||||
|
upper_left: shift direction
|
||||||
|
"""
|
||||||
|
h, w = x.shape[:2]
|
||||||
|
shift = (sf - 1) * 0.5
|
||||||
|
xv, yv = np.arange(0, w, 1.0), np.arange(0, h, 1.0)
|
||||||
|
if upper_left:
|
||||||
|
x1 = xv + shift
|
||||||
|
y1 = yv + shift
|
||||||
|
else:
|
||||||
|
x1 = xv - shift
|
||||||
|
y1 = yv - shift
|
||||||
|
|
||||||
|
x1 = np.clip(x1, 0, w - 1)
|
||||||
|
y1 = np.clip(y1, 0, h - 1)
|
||||||
|
|
||||||
|
if x.ndim == 2:
|
||||||
|
x = interp2d(xv, yv, x)(x1, y1)
|
||||||
|
if x.ndim == 3:
|
||||||
|
for i in range(x.shape[-1]):
|
||||||
|
x[:, :, i] = interp2d(xv, yv, x[:, :, i])(x1, y1)
|
||||||
|
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
def blur(x, k):
|
||||||
|
'''
|
||||||
|
x: image, NxcxHxW
|
||||||
|
k: kernel, Nx1xhxw
|
||||||
|
'''
|
||||||
|
n, c = x.shape[:2]
|
||||||
|
p1, p2 = (k.shape[-2] - 1) // 2, (k.shape[-1] - 1) // 2
|
||||||
|
x = torch.nn.functional.pad(x, pad=(p1, p2, p1, p2), mode='replicate')
|
||||||
|
k = k.repeat(1, c, 1, 1)
|
||||||
|
k = k.view(-1, 1, k.shape[2], k.shape[3])
|
||||||
|
x = x.view(1, -1, x.shape[2], x.shape[3])
|
||||||
|
x = torch.nn.functional.conv2d(x, k, bias=None, stride=1, padding=0, groups=n * c)
|
||||||
|
x = x.view(n, c, x.shape[2], x.shape[3])
|
||||||
|
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
def gen_kernel(k_size=np.array([15, 15]), scale_factor=np.array([4, 4]), min_var=0.6, max_var=10., noise_level=0):
|
||||||
|
""""
|
||||||
|
# modified version of https://github.com/assafshocher/BlindSR_dataset_generator
|
||||||
|
# Kai Zhang
|
||||||
|
# min_var = 0.175 * sf # variance of the gaussian kernel will be sampled between min_var and max_var
|
||||||
|
# max_var = 2.5 * sf
|
||||||
|
"""
|
||||||
|
# Set random eigen-vals (lambdas) and angle (theta) for COV matrix
|
||||||
|
lambda_1 = min_var + np.random.rand() * (max_var - min_var)
|
||||||
|
lambda_2 = min_var + np.random.rand() * (max_var - min_var)
|
||||||
|
theta = np.random.rand() * np.pi # random theta
|
||||||
|
noise = -noise_level + np.random.rand(*k_size) * noise_level * 2
|
||||||
|
|
||||||
|
# Set COV matrix using Lambdas and Theta
|
||||||
|
LAMBDA = np.diag([lambda_1, lambda_2])
|
||||||
|
Q = np.array([[np.cos(theta), -np.sin(theta)],
|
||||||
|
[np.sin(theta), np.cos(theta)]])
|
||||||
|
SIGMA = Q @ LAMBDA @ Q.T
|
||||||
|
INV_SIGMA = np.linalg.inv(SIGMA)[None, None, :, :]
|
||||||
|
|
||||||
|
# Set expectation position (shifting kernel for aligned image)
|
||||||
|
MU = k_size // 2 - 0.5 * (scale_factor - 1) # - 0.5 * (scale_factor - k_size % 2)
|
||||||
|
MU = MU[None, None, :, None]
|
||||||
|
|
||||||
|
# Create meshgrid for Gaussian
|
||||||
|
[X, Y] = np.meshgrid(range(k_size[0]), range(k_size[1]))
|
||||||
|
Z = np.stack([X, Y], 2)[:, :, :, None]
|
||||||
|
|
||||||
|
# Calcualte Gaussian for every pixel of the kernel
|
||||||
|
ZZ = Z - MU
|
||||||
|
ZZ_t = ZZ.transpose(0, 1, 3, 2)
|
||||||
|
raw_kernel = np.exp(-0.5 * np.squeeze(ZZ_t @ INV_SIGMA @ ZZ)) * (1 + noise)
|
||||||
|
|
||||||
|
# shift the kernel so it will be centered
|
||||||
|
# raw_kernel_centered = kernel_shift(raw_kernel, scale_factor)
|
||||||
|
|
||||||
|
# Normalize the kernel and return
|
||||||
|
# kernel = raw_kernel_centered / np.sum(raw_kernel_centered)
|
||||||
|
kernel = raw_kernel / np.sum(raw_kernel)
|
||||||
|
return kernel
|
||||||
|
|
||||||
|
|
||||||
|
def fspecial_gaussian(hsize, sigma):
|
||||||
|
hsize = [hsize, hsize]
|
||||||
|
siz = [(hsize[0] - 1.0) / 2.0, (hsize[1] - 1.0) / 2.0]
|
||||||
|
std = sigma
|
||||||
|
[x, y] = np.meshgrid(np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1))
|
||||||
|
arg = -(x * x + y * y) / (2 * std * std)
|
||||||
|
h = np.exp(arg)
|
||||||
|
h[h < scipy.finfo(float).eps * h.max()] = 0
|
||||||
|
sumh = h.sum()
|
||||||
|
if sumh != 0:
|
||||||
|
h = h / sumh
|
||||||
|
return h
|
||||||
|
|
||||||
|
|
||||||
|
def fspecial_laplacian(alpha):
|
||||||
|
alpha = max([0, min([alpha, 1])])
|
||||||
|
h1 = alpha / (alpha + 1)
|
||||||
|
h2 = (1 - alpha) / (alpha + 1)
|
||||||
|
h = [[h1, h2, h1], [h2, -4 / (alpha + 1), h2], [h1, h2, h1]]
|
||||||
|
h = np.array(h)
|
||||||
|
return h
|
||||||
|
|
||||||
|
|
||||||
|
def fspecial(filter_type, *args, **kwargs):
|
||||||
|
'''
|
||||||
|
python code from:
|
||||||
|
https://github.com/ronaldosena/imagens-medicas-2/blob/40171a6c259edec7827a6693a93955de2bd39e76/Aulas/aula_2_-_uniform_filter/matlab_fspecial.py
|
||||||
|
'''
|
||||||
|
if filter_type == 'gaussian':
|
||||||
|
return fspecial_gaussian(*args, **kwargs)
|
||||||
|
if filter_type == 'laplacian':
|
||||||
|
return fspecial_laplacian(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
# --------------------------------------------
|
||||||
|
# degradation models
|
||||||
|
# --------------------------------------------
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def bicubic_degradation(x, sf=3):
|
||||||
|
'''
|
||||||
|
Args:
|
||||||
|
x: HxWxC image, [0, 1]
|
||||||
|
sf: down-scale factor
|
||||||
|
Return:
|
||||||
|
bicubicly downsampled LR image
|
||||||
|
'''
|
||||||
|
x = util.imresize_np(x, scale=1 / sf)
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
def srmd_degradation(x, k, sf=3):
|
||||||
|
''' blur + bicubic downsampling
|
||||||
|
Args:
|
||||||
|
x: HxWxC image, [0, 1]
|
||||||
|
k: hxw, double
|
||||||
|
sf: down-scale factor
|
||||||
|
Return:
|
||||||
|
downsampled LR image
|
||||||
|
Reference:
|
||||||
|
@inproceedings{zhang2018learning,
|
||||||
|
title={Learning a single convolutional super-resolution network for multiple degradations},
|
||||||
|
author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei},
|
||||||
|
booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
|
||||||
|
pages={3262--3271},
|
||||||
|
year={2018}
|
||||||
|
}
|
||||||
|
'''
|
||||||
|
x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap') # 'nearest' | 'mirror'
|
||||||
|
x = bicubic_degradation(x, sf=sf)
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
def dpsr_degradation(x, k, sf=3):
|
||||||
|
''' bicubic downsampling + blur
|
||||||
|
Args:
|
||||||
|
x: HxWxC image, [0, 1]
|
||||||
|
k: hxw, double
|
||||||
|
sf: down-scale factor
|
||||||
|
Return:
|
||||||
|
downsampled LR image
|
||||||
|
Reference:
|
||||||
|
@inproceedings{zhang2019deep,
|
||||||
|
title={Deep Plug-and-Play Super-Resolution for Arbitrary Blur Kernels},
|
||||||
|
author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei},
|
||||||
|
booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
|
||||||
|
pages={1671--1681},
|
||||||
|
year={2019}
|
||||||
|
}
|
||||||
|
'''
|
||||||
|
x = bicubic_degradation(x, sf=sf)
|
||||||
|
x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
def classical_degradation(x, k, sf=3):
|
||||||
|
''' blur + downsampling
|
||||||
|
Args:
|
||||||
|
x: HxWxC image, [0, 1]/[0, 255]
|
||||||
|
k: hxw, double
|
||||||
|
sf: down-scale factor
|
||||||
|
Return:
|
||||||
|
downsampled LR image
|
||||||
|
'''
|
||||||
|
x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
|
||||||
|
# x = filters.correlate(x, np.expand_dims(np.flip(k), axis=2))
|
||||||
|
st = 0
|
||||||
|
return x[st::sf, st::sf, ...]
|
||||||
|
|
||||||
|
|
||||||
|
def add_sharpening(img, weight=0.5, radius=50, threshold=10):
|
||||||
|
"""USM sharpening. borrowed from real-ESRGAN
|
||||||
|
Input image: I; Blurry image: B.
|
||||||
|
1. K = I + weight * (I - B)
|
||||||
|
2. Mask = 1 if abs(I - B) > threshold, else: 0
|
||||||
|
3. Blur mask:
|
||||||
|
4. Out = Mask * K + (1 - Mask) * I
|
||||||
|
Args:
|
||||||
|
img (Numpy array): Input image, HWC, BGR; float32, [0, 1].
|
||||||
|
weight (float): Sharp weight. Default: 1.
|
||||||
|
radius (float): Kernel size of Gaussian blur. Default: 50.
|
||||||
|
threshold (int):
|
||||||
|
"""
|
||||||
|
if radius % 2 == 0:
|
||||||
|
radius += 1
|
||||||
|
blur = cv2.GaussianBlur(img, (radius, radius), 0)
|
||||||
|
residual = img - blur
|
||||||
|
mask = np.abs(residual) * 255 > threshold
|
||||||
|
mask = mask.astype('float32')
|
||||||
|
soft_mask = cv2.GaussianBlur(mask, (radius, radius), 0)
|
||||||
|
|
||||||
|
K = img + weight * residual
|
||||||
|
K = np.clip(K, 0, 1)
|
||||||
|
return soft_mask * K + (1 - soft_mask) * img
|
||||||
|
|
||||||
|
|
||||||
|
def add_blur(img, sf=4):
|
||||||
|
wd2 = 4.0 + sf
|
||||||
|
wd = 2.0 + 0.2 * sf
|
||||||
|
if random.random() < 0.5:
|
||||||
|
l1 = wd2 * random.random()
|
||||||
|
l2 = wd2 * random.random()
|
||||||
|
k = anisotropic_Gaussian(ksize=2 * random.randint(2, 11) + 3, theta=random.random() * np.pi, l1=l1, l2=l2)
|
||||||
|
else:
|
||||||
|
k = fspecial('gaussian', 2 * random.randint(2, 11) + 3, wd * random.random())
|
||||||
|
img = ndimage.filters.convolve(img, np.expand_dims(k, axis=2), mode='mirror')
|
||||||
|
|
||||||
|
return img
|
||||||
|
|
||||||
|
|
||||||
|
def add_resize(img, sf=4):
|
||||||
|
rnum = np.random.rand()
|
||||||
|
if rnum > 0.8: # up
|
||||||
|
sf1 = random.uniform(1, 2)
|
||||||
|
elif rnum < 0.7: # down
|
||||||
|
sf1 = random.uniform(0.5 / sf, 1)
|
||||||
|
else:
|
||||||
|
sf1 = 1.0
|
||||||
|
img = cv2.resize(img, (int(sf1 * img.shape[1]), int(sf1 * img.shape[0])), interpolation=random.choice([1, 2, 3]))
|
||||||
|
img = np.clip(img, 0.0, 1.0)
|
||||||
|
|
||||||
|
return img
|
||||||
|
|
||||||
|
|
||||||
|
# def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
|
||||||
|
# noise_level = random.randint(noise_level1, noise_level2)
|
||||||
|
# rnum = np.random.rand()
|
||||||
|
# if rnum > 0.6: # add color Gaussian noise
|
||||||
|
# img += np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
|
||||||
|
# elif rnum < 0.4: # add grayscale Gaussian noise
|
||||||
|
# img += np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
|
||||||
|
# else: # add noise
|
||||||
|
# L = noise_level2 / 255.
|
||||||
|
# D = np.diag(np.random.rand(3))
|
||||||
|
# U = orth(np.random.rand(3, 3))
|
||||||
|
# conv = np.dot(np.dot(np.transpose(U), D), U)
|
||||||
|
# img += np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
|
||||||
|
# img = np.clip(img, 0.0, 1.0)
|
||||||
|
# return img
|
||||||
|
|
||||||
|
def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
|
||||||
|
noise_level = random.randint(noise_level1, noise_level2)
|
||||||
|
rnum = np.random.rand()
|
||||||
|
if rnum > 0.6: # add color Gaussian noise
|
||||||
|
img = img + np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
|
||||||
|
elif rnum < 0.4: # add grayscale Gaussian noise
|
||||||
|
img = img + np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
|
||||||
|
else: # add noise
|
||||||
|
L = noise_level2 / 255.
|
||||||
|
D = np.diag(np.random.rand(3))
|
||||||
|
U = orth(np.random.rand(3, 3))
|
||||||
|
conv = np.dot(np.dot(np.transpose(U), D), U)
|
||||||
|
img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
|
||||||
|
img = np.clip(img, 0.0, 1.0)
|
||||||
|
return img
|
||||||
|
|
||||||
|
|
||||||
|
def add_speckle_noise(img, noise_level1=2, noise_level2=25):
|
||||||
|
noise_level = random.randint(noise_level1, noise_level2)
|
||||||
|
img = np.clip(img, 0.0, 1.0)
|
||||||
|
rnum = random.random()
|
||||||
|
if rnum > 0.6:
|
||||||
|
img += img * np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
|
||||||
|
elif rnum < 0.4:
|
||||||
|
img += img * np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
|
||||||
|
else:
|
||||||
|
L = noise_level2 / 255.
|
||||||
|
D = np.diag(np.random.rand(3))
|
||||||
|
U = orth(np.random.rand(3, 3))
|
||||||
|
conv = np.dot(np.dot(np.transpose(U), D), U)
|
||||||
|
img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
|
||||||
|
img = np.clip(img, 0.0, 1.0)
|
||||||
|
return img
|
||||||
|
|
||||||
|
|
||||||
|
def add_Poisson_noise(img):
|
||||||
|
img = np.clip((img * 255.0).round(), 0, 255) / 255.
|
||||||
|
vals = 10 ** (2 * random.random() + 2.0) # [2, 4]
|
||||||
|
if random.random() < 0.5:
|
||||||
|
img = np.random.poisson(img * vals).astype(np.float32) / vals
|
||||||
|
else:
|
||||||
|
img_gray = np.dot(img[..., :3], [0.299, 0.587, 0.114])
|
||||||
|
img_gray = np.clip((img_gray * 255.0).round(), 0, 255) / 255.
|
||||||
|
noise_gray = np.random.poisson(img_gray * vals).astype(np.float32) / vals - img_gray
|
||||||
|
img += noise_gray[:, :, np.newaxis]
|
||||||
|
img = np.clip(img, 0.0, 1.0)
|
||||||
|
return img
|
||||||
|
|
||||||
|
|
||||||
|
def add_JPEG_noise(img):
|
||||||
|
quality_factor = random.randint(30, 95)
|
||||||
|
img = cv2.cvtColor(util.single2uint(img), cv2.COLOR_RGB2BGR)
|
||||||
|
result, encimg = cv2.imencode('.jpg', img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor])
|
||||||
|
img = cv2.imdecode(encimg, 1)
|
||||||
|
img = cv2.cvtColor(util.uint2single(img), cv2.COLOR_BGR2RGB)
|
||||||
|
return img
|
||||||
|
|
||||||
|
|
||||||
|
def random_crop(lq, hq, sf=4, lq_patchsize=64):
|
||||||
|
h, w = lq.shape[:2]
|
||||||
|
rnd_h = random.randint(0, h - lq_patchsize)
|
||||||
|
rnd_w = random.randint(0, w - lq_patchsize)
|
||||||
|
lq = lq[rnd_h:rnd_h + lq_patchsize, rnd_w:rnd_w + lq_patchsize, :]
|
||||||
|
|
||||||
|
rnd_h_H, rnd_w_H = int(rnd_h * sf), int(rnd_w * sf)
|
||||||
|
hq = hq[rnd_h_H:rnd_h_H + lq_patchsize * sf, rnd_w_H:rnd_w_H + lq_patchsize * sf, :]
|
||||||
|
return lq, hq
|
||||||
|
|
||||||
|
|
||||||
|
def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
|
||||||
|
"""
|
||||||
|
This is the degradation model of BSRGAN from the paper
|
||||||
|
"Designing a Practical Degradation Model for Deep Blind Image Super-Resolution"
|
||||||
|
----------
|
||||||
|
img: HXWXC, [0, 1], its size should be large than (lq_patchsizexsf)x(lq_patchsizexsf)
|
||||||
|
sf: scale factor
|
||||||
|
isp_model: camera ISP model
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1]
|
||||||
|
hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1]
|
||||||
|
"""
|
||||||
|
isp_prob, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25
|
||||||
|
sf_ori = sf
|
||||||
|
|
||||||
|
h1, w1 = img.shape[:2]
|
||||||
|
img = img.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop
|
||||||
|
h, w = img.shape[:2]
|
||||||
|
|
||||||
|
if h < lq_patchsize * sf or w < lq_patchsize * sf:
|
||||||
|
raise ValueError(f'img size ({h1}X{w1}) is too small!')
|
||||||
|
|
||||||
|
hq = img.copy()
|
||||||
|
|
||||||
|
if sf == 4 and random.random() < scale2_prob: # downsample1
|
||||||
|
if np.random.rand() < 0.5:
|
||||||
|
img = cv2.resize(img, (int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])),
|
||||||
|
interpolation=random.choice([1, 2, 3]))
|
||||||
|
else:
|
||||||
|
img = util.imresize_np(img, 1 / 2, True)
|
||||||
|
img = np.clip(img, 0.0, 1.0)
|
||||||
|
sf = 2
|
||||||
|
|
||||||
|
shuffle_order = random.sample(range(7), 7)
|
||||||
|
idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
|
||||||
|
if idx1 > idx2: # keep downsample3 last
|
||||||
|
shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]
|
||||||
|
|
||||||
|
for i in shuffle_order:
|
||||||
|
|
||||||
|
if i == 0:
|
||||||
|
img = add_blur(img, sf=sf)
|
||||||
|
|
||||||
|
elif i == 1:
|
||||||
|
img = add_blur(img, sf=sf)
|
||||||
|
|
||||||
|
elif i == 2:
|
||||||
|
a, b = img.shape[1], img.shape[0]
|
||||||
|
# downsample2
|
||||||
|
if random.random() < 0.75:
|
||||||
|
sf1 = random.uniform(1, 2 * sf)
|
||||||
|
img = cv2.resize(img, (int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])),
|
||||||
|
interpolation=random.choice([1, 2, 3]))
|
||||||
|
else:
|
||||||
|
k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
|
||||||
|
k_shifted = shift_pixel(k, sf)
|
||||||
|
k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel
|
||||||
|
img = ndimage.filters.convolve(img, np.expand_dims(k_shifted, axis=2), mode='mirror')
|
||||||
|
img = img[0::sf, 0::sf, ...] # nearest downsampling
|
||||||
|
img = np.clip(img, 0.0, 1.0)
|
||||||
|
|
||||||
|
elif i == 3:
|
||||||
|
# downsample3
|
||||||
|
img = cv2.resize(img, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
|
||||||
|
img = np.clip(img, 0.0, 1.0)
|
||||||
|
|
||||||
|
elif i == 4:
|
||||||
|
# add Gaussian noise
|
||||||
|
img = add_Gaussian_noise(img, noise_level1=2, noise_level2=25)
|
||||||
|
|
||||||
|
elif i == 5:
|
||||||
|
# add JPEG noise
|
||||||
|
if random.random() < jpeg_prob:
|
||||||
|
img = add_JPEG_noise(img)
|
||||||
|
|
||||||
|
elif i == 6:
|
||||||
|
# add processed camera sensor noise
|
||||||
|
if random.random() < isp_prob and isp_model is not None:
|
||||||
|
with torch.no_grad():
|
||||||
|
img, hq = isp_model.forward(img.copy(), hq)
|
||||||
|
|
||||||
|
# add final JPEG compression noise
|
||||||
|
img = add_JPEG_noise(img)
|
||||||
|
|
||||||
|
# random crop
|
||||||
|
img, hq = random_crop(img, hq, sf_ori, lq_patchsize)
|
||||||
|
|
||||||
|
return img, hq
|
||||||
|
|
||||||
|
|
||||||
|
# todo no isp_model?
|
||||||
|
def degradation_bsrgan_variant(image, sf=4, isp_model=None):
|
||||||
|
"""
|
||||||
|
This is the degradation model of BSRGAN from the paper
|
||||||
|
"Designing a Practical Degradation Model for Deep Blind Image Super-Resolution"
|
||||||
|
----------
|
||||||
|
sf: scale factor
|
||||||
|
isp_model: camera ISP model
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1]
|
||||||
|
hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1]
|
||||||
|
"""
|
||||||
|
image = util.uint2single(image)
|
||||||
|
isp_prob, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25
|
||||||
|
sf_ori = sf
|
||||||
|
|
||||||
|
h1, w1 = image.shape[:2]
|
||||||
|
image = image.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop
|
||||||
|
h, w = image.shape[:2]
|
||||||
|
|
||||||
|
hq = image.copy()
|
||||||
|
|
||||||
|
if sf == 4 and random.random() < scale2_prob: # downsample1
|
||||||
|
if np.random.rand() < 0.5:
|
||||||
|
image = cv2.resize(image, (int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])),
|
||||||
|
interpolation=random.choice([1, 2, 3]))
|
||||||
|
else:
|
||||||
|
image = util.imresize_np(image, 1 / 2, True)
|
||||||
|
image = np.clip(image, 0.0, 1.0)
|
||||||
|
sf = 2
|
||||||
|
|
||||||
|
shuffle_order = random.sample(range(7), 7)
|
||||||
|
idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
|
||||||
|
if idx1 > idx2: # keep downsample3 last
|
||||||
|
shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]
|
||||||
|
|
||||||
|
for i in shuffle_order:
|
||||||
|
|
||||||
|
if i == 0:
|
||||||
|
image = add_blur(image, sf=sf)
|
||||||
|
|
||||||
|
elif i == 1:
|
||||||
|
image = add_blur(image, sf=sf)
|
||||||
|
|
||||||
|
elif i == 2:
|
||||||
|
a, b = image.shape[1], image.shape[0]
|
||||||
|
# downsample2
|
||||||
|
if random.random() < 0.75:
|
||||||
|
sf1 = random.uniform(1, 2 * sf)
|
||||||
|
image = cv2.resize(image, (int(1 / sf1 * image.shape[1]), int(1 / sf1 * image.shape[0])),
|
||||||
|
interpolation=random.choice([1, 2, 3]))
|
||||||
|
else:
|
||||||
|
k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
|
||||||
|
k_shifted = shift_pixel(k, sf)
|
||||||
|
k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel
|
||||||
|
image = ndimage.filters.convolve(image, np.expand_dims(k_shifted, axis=2), mode='mirror')
|
||||||
|
image = image[0::sf, 0::sf, ...] # nearest downsampling
|
||||||
|
image = np.clip(image, 0.0, 1.0)
|
||||||
|
|
||||||
|
elif i == 3:
|
||||||
|
# downsample3
|
||||||
|
image = cv2.resize(image, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
|
||||||
|
image = np.clip(image, 0.0, 1.0)
|
||||||
|
|
||||||
|
elif i == 4:
|
||||||
|
# add Gaussian noise
|
||||||
|
image = add_Gaussian_noise(image, noise_level1=2, noise_level2=25)
|
||||||
|
|
||||||
|
elif i == 5:
|
||||||
|
# add JPEG noise
|
||||||
|
if random.random() < jpeg_prob:
|
||||||
|
image = add_JPEG_noise(image)
|
||||||
|
|
||||||
|
# elif i == 6:
|
||||||
|
# # add processed camera sensor noise
|
||||||
|
# if random.random() < isp_prob and isp_model is not None:
|
||||||
|
# with torch.no_grad():
|
||||||
|
# img, hq = isp_model.forward(img.copy(), hq)
|
||||||
|
|
||||||
|
# add final JPEG compression noise
|
||||||
|
image = add_JPEG_noise(image)
|
||||||
|
image = util.single2uint(image)
|
||||||
|
example = {"image":image}
|
||||||
|
return example
|
||||||
|
|
||||||
|
|
||||||
|
# TODO incase there is a pickle error one needs to replace a += x with a = a + x in add_speckle_noise etc...
|
||||||
|
def degradation_bsrgan_plus(img, sf=4, shuffle_prob=0.5, use_sharp=True, lq_patchsize=64, isp_model=None):
|
||||||
|
"""
|
||||||
|
This is an extended degradation model by combining
|
||||||
|
the degradation models of BSRGAN and Real-ESRGAN
|
||||||
|
----------
|
||||||
|
img: HXWXC, [0, 1], its size should be large than (lq_patchsizexsf)x(lq_patchsizexsf)
|
||||||
|
sf: scale factor
|
||||||
|
use_shuffle: the degradation shuffle
|
||||||
|
use_sharp: sharpening the img
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1]
|
||||||
|
hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1]
|
||||||
|
"""
|
||||||
|
|
||||||
|
h1, w1 = img.shape[:2]
|
||||||
|
img = img.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop
|
||||||
|
h, w = img.shape[:2]
|
||||||
|
|
||||||
|
if h < lq_patchsize * sf or w < lq_patchsize * sf:
|
||||||
|
raise ValueError(f'img size ({h1}X{w1}) is too small!')
|
||||||
|
|
||||||
|
if use_sharp:
|
||||||
|
img = add_sharpening(img)
|
||||||
|
hq = img.copy()
|
||||||
|
|
||||||
|
if random.random() < shuffle_prob:
|
||||||
|
shuffle_order = random.sample(range(13), 13)
|
||||||
|
else:
|
||||||
|
shuffle_order = list(range(13))
|
||||||
|
# local shuffle for noise, JPEG is always the last one
|
||||||
|
shuffle_order[2:6] = random.sample(shuffle_order[2:6], len(range(2, 6)))
|
||||||
|
shuffle_order[9:13] = random.sample(shuffle_order[9:13], len(range(9, 13)))
|
||||||
|
|
||||||
|
poisson_prob, speckle_prob, isp_prob = 0.1, 0.1, 0.1
|
||||||
|
|
||||||
|
for i in shuffle_order:
|
||||||
|
if i == 0:
|
||||||
|
img = add_blur(img, sf=sf)
|
||||||
|
elif i == 1:
|
||||||
|
img = add_resize(img, sf=sf)
|
||||||
|
elif i == 2:
|
||||||
|
img = add_Gaussian_noise(img, noise_level1=2, noise_level2=25)
|
||||||
|
elif i == 3:
|
||||||
|
if random.random() < poisson_prob:
|
||||||
|
img = add_Poisson_noise(img)
|
||||||
|
elif i == 4:
|
||||||
|
if random.random() < speckle_prob:
|
||||||
|
img = add_speckle_noise(img)
|
||||||
|
elif i == 5:
|
||||||
|
if random.random() < isp_prob and isp_model is not None:
|
||||||
|
with torch.no_grad():
|
||||||
|
img, hq = isp_model.forward(img.copy(), hq)
|
||||||
|
elif i == 6:
|
||||||
|
img = add_JPEG_noise(img)
|
||||||
|
elif i == 7:
|
||||||
|
img = add_blur(img, sf=sf)
|
||||||
|
elif i == 8:
|
||||||
|
img = add_resize(img, sf=sf)
|
||||||
|
elif i == 9:
|
||||||
|
img = add_Gaussian_noise(img, noise_level1=2, noise_level2=25)
|
||||||
|
elif i == 10:
|
||||||
|
if random.random() < poisson_prob:
|
||||||
|
img = add_Poisson_noise(img)
|
||||||
|
elif i == 11:
|
||||||
|
if random.random() < speckle_prob:
|
||||||
|
img = add_speckle_noise(img)
|
||||||
|
elif i == 12:
|
||||||
|
if random.random() < isp_prob and isp_model is not None:
|
||||||
|
with torch.no_grad():
|
||||||
|
img, hq = isp_model.forward(img.copy(), hq)
|
||||||
|
else:
|
||||||
|
print('check the shuffle!')
|
||||||
|
|
||||||
|
# resize to desired size
|
||||||
|
img = cv2.resize(img, (int(1 / sf * hq.shape[1]), int(1 / sf * hq.shape[0])),
|
||||||
|
interpolation=random.choice([1, 2, 3]))
|
||||||
|
|
||||||
|
# add final JPEG compression noise
|
||||||
|
img = add_JPEG_noise(img)
|
||||||
|
|
||||||
|
# random crop
|
||||||
|
img, hq = random_crop(img, hq, sf, lq_patchsize)
|
||||||
|
|
||||||
|
return img, hq
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
print("hey")
|
||||||
|
img = util.imread_uint('utils/test.png', 3)
|
||||||
|
print(img)
|
||||||
|
img = util.uint2single(img)
|
||||||
|
print(img)
|
||||||
|
img = img[:448, :448]
|
||||||
|
h = img.shape[0] // 4
|
||||||
|
print("resizing to", h)
|
||||||
|
sf = 4
|
||||||
|
deg_fn = partial(degradation_bsrgan_variant, sf=sf)
|
||||||
|
for i in range(20):
|
||||||
|
print(i)
|
||||||
|
img_lq = deg_fn(img)
|
||||||
|
print(img_lq)
|
||||||
|
img_lq_bicubic = albumentations.SmallestMaxSize(max_size=h, interpolation=cv2.INTER_CUBIC)(image=img)["image"]
|
||||||
|
print(img_lq.shape)
|
||||||
|
print("bicubic", img_lq_bicubic.shape)
|
||||||
|
print(img_hq.shape)
|
||||||
|
lq_nearest = cv2.resize(util.single2uint(img_lq), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
|
||||||
|
interpolation=0)
|
||||||
|
lq_bicubic_nearest = cv2.resize(util.single2uint(img_lq_bicubic), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
|
||||||
|
interpolation=0)
|
||||||
|
img_concat = np.concatenate([lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1)
|
||||||
|
util.imsave(img_concat, str(i) + '.png')
|
||||||
|
|
||||||
|
|
@ -0,0 +1,651 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import numpy as np
|
||||||
|
import cv2
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from functools import partial
|
||||||
|
import random
|
||||||
|
from scipy import ndimage
|
||||||
|
import scipy
|
||||||
|
import scipy.stats as ss
|
||||||
|
from scipy.interpolate import interp2d
|
||||||
|
from scipy.linalg import orth
|
||||||
|
import albumentations
|
||||||
|
|
||||||
|
from . import utils_image as util
|
||||||
|
|
||||||
|
"""
|
||||||
|
# --------------------------------------------
|
||||||
|
# Super-Resolution
|
||||||
|
# --------------------------------------------
|
||||||
|
#
|
||||||
|
# Kai Zhang (cskaizhang@gmail.com)
|
||||||
|
# https://github.com/cszn
|
||||||
|
# From 2019/03--2021/08
|
||||||
|
# --------------------------------------------
|
||||||
|
"""
|
||||||
|
|
||||||
|
def modcrop_np(img, sf):
|
||||||
|
'''
|
||||||
|
Args:
|
||||||
|
img: numpy image, WxH or WxHxC
|
||||||
|
sf: scale factor
|
||||||
|
Return:
|
||||||
|
cropped image
|
||||||
|
'''
|
||||||
|
w, h = img.shape[:2]
|
||||||
|
im = np.copy(img)
|
||||||
|
return im[:w - w % sf, :h - h % sf, ...]
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
# --------------------------------------------
|
||||||
|
# anisotropic Gaussian kernels
|
||||||
|
# --------------------------------------------
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def analytic_kernel(k):
|
||||||
|
"""Calculate the X4 kernel from the X2 kernel (for proof see appendix in paper)"""
|
||||||
|
k_size = k.shape[0]
|
||||||
|
# Calculate the big kernels size
|
||||||
|
big_k = np.zeros((3 * k_size - 2, 3 * k_size - 2))
|
||||||
|
# Loop over the small kernel to fill the big one
|
||||||
|
for r in range(k_size):
|
||||||
|
for c in range(k_size):
|
||||||
|
big_k[2 * r:2 * r + k_size, 2 * c:2 * c + k_size] += k[r, c] * k
|
||||||
|
# Crop the edges of the big kernel to ignore very small values and increase run time of SR
|
||||||
|
crop = k_size // 2
|
||||||
|
cropped_big_k = big_k[crop:-crop, crop:-crop]
|
||||||
|
# Normalize to 1
|
||||||
|
return cropped_big_k / cropped_big_k.sum()
|
||||||
|
|
||||||
|
|
||||||
|
def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6):
|
||||||
|
""" generate an anisotropic Gaussian kernel
|
||||||
|
Args:
|
||||||
|
ksize : e.g., 15, kernel size
|
||||||
|
theta : [0, pi], rotation angle range
|
||||||
|
l1 : [0.1,50], scaling of eigenvalues
|
||||||
|
l2 : [0.1,l1], scaling of eigenvalues
|
||||||
|
If l1 = l2, will get an isotropic Gaussian kernel.
|
||||||
|
Returns:
|
||||||
|
k : kernel
|
||||||
|
"""
|
||||||
|
|
||||||
|
v = np.dot(np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]), np.array([1., 0.]))
|
||||||
|
V = np.array([[v[0], v[1]], [v[1], -v[0]]])
|
||||||
|
D = np.array([[l1, 0], [0, l2]])
|
||||||
|
Sigma = np.dot(np.dot(V, D), np.linalg.inv(V))
|
||||||
|
k = gm_blur_kernel(mean=[0, 0], cov=Sigma, size=ksize)
|
||||||
|
|
||||||
|
return k
|
||||||
|
|
||||||
|
|
||||||
|
def gm_blur_kernel(mean, cov, size=15):
|
||||||
|
center = size / 2.0 + 0.5
|
||||||
|
k = np.zeros([size, size])
|
||||||
|
for y in range(size):
|
||||||
|
for x in range(size):
|
||||||
|
cy = y - center + 1
|
||||||
|
cx = x - center + 1
|
||||||
|
k[y, x] = ss.multivariate_normal.pdf([cx, cy], mean=mean, cov=cov)
|
||||||
|
|
||||||
|
k = k / np.sum(k)
|
||||||
|
return k
|
||||||
|
|
||||||
|
|
||||||
|
def shift_pixel(x, sf, upper_left=True):
|
||||||
|
"""shift pixel for super-resolution with different scale factors
|
||||||
|
Args:
|
||||||
|
x: WxHxC or WxH
|
||||||
|
sf: scale factor
|
||||||
|
upper_left: shift direction
|
||||||
|
"""
|
||||||
|
h, w = x.shape[:2]
|
||||||
|
shift = (sf - 1) * 0.5
|
||||||
|
xv, yv = np.arange(0, w, 1.0), np.arange(0, h, 1.0)
|
||||||
|
if upper_left:
|
||||||
|
x1 = xv + shift
|
||||||
|
y1 = yv + shift
|
||||||
|
else:
|
||||||
|
x1 = xv - shift
|
||||||
|
y1 = yv - shift
|
||||||
|
|
||||||
|
x1 = np.clip(x1, 0, w - 1)
|
||||||
|
y1 = np.clip(y1, 0, h - 1)
|
||||||
|
|
||||||
|
if x.ndim == 2:
|
||||||
|
x = interp2d(xv, yv, x)(x1, y1)
|
||||||
|
if x.ndim == 3:
|
||||||
|
for i in range(x.shape[-1]):
|
||||||
|
x[:, :, i] = interp2d(xv, yv, x[:, :, i])(x1, y1)
|
||||||
|
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
def blur(x, k):
|
||||||
|
'''
|
||||||
|
x: image, NxcxHxW
|
||||||
|
k: kernel, Nx1xhxw
|
||||||
|
'''
|
||||||
|
n, c = x.shape[:2]
|
||||||
|
p1, p2 = (k.shape[-2] - 1) // 2, (k.shape[-1] - 1) // 2
|
||||||
|
x = torch.nn.functional.pad(x, pad=(p1, p2, p1, p2), mode='replicate')
|
||||||
|
k = k.repeat(1, c, 1, 1)
|
||||||
|
k = k.view(-1, 1, k.shape[2], k.shape[3])
|
||||||
|
x = x.view(1, -1, x.shape[2], x.shape[3])
|
||||||
|
x = torch.nn.functional.conv2d(x, k, bias=None, stride=1, padding=0, groups=n * c)
|
||||||
|
x = x.view(n, c, x.shape[2], x.shape[3])
|
||||||
|
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
def gen_kernel(k_size=np.array([15, 15]), scale_factor=np.array([4, 4]), min_var=0.6, max_var=10., noise_level=0):
|
||||||
|
""""
|
||||||
|
# modified version of https://github.com/assafshocher/BlindSR_dataset_generator
|
||||||
|
# Kai Zhang
|
||||||
|
# min_var = 0.175 * sf # variance of the gaussian kernel will be sampled between min_var and max_var
|
||||||
|
# max_var = 2.5 * sf
|
||||||
|
"""
|
||||||
|
# Set random eigen-vals (lambdas) and angle (theta) for COV matrix
|
||||||
|
lambda_1 = min_var + np.random.rand() * (max_var - min_var)
|
||||||
|
lambda_2 = min_var + np.random.rand() * (max_var - min_var)
|
||||||
|
theta = np.random.rand() * np.pi # random theta
|
||||||
|
noise = -noise_level + np.random.rand(*k_size) * noise_level * 2
|
||||||
|
|
||||||
|
# Set COV matrix using Lambdas and Theta
|
||||||
|
LAMBDA = np.diag([lambda_1, lambda_2])
|
||||||
|
Q = np.array([[np.cos(theta), -np.sin(theta)],
|
||||||
|
[np.sin(theta), np.cos(theta)]])
|
||||||
|
SIGMA = Q @ LAMBDA @ Q.T
|
||||||
|
INV_SIGMA = np.linalg.inv(SIGMA)[None, None, :, :]
|
||||||
|
|
||||||
|
# Set expectation position (shifting kernel for aligned image)
|
||||||
|
MU = k_size // 2 - 0.5 * (scale_factor - 1) # - 0.5 * (scale_factor - k_size % 2)
|
||||||
|
MU = MU[None, None, :, None]
|
||||||
|
|
||||||
|
# Create meshgrid for Gaussian
|
||||||
|
[X, Y] = np.meshgrid(range(k_size[0]), range(k_size[1]))
|
||||||
|
Z = np.stack([X, Y], 2)[:, :, :, None]
|
||||||
|
|
||||||
|
# Calcualte Gaussian for every pixel of the kernel
|
||||||
|
ZZ = Z - MU
|
||||||
|
ZZ_t = ZZ.transpose(0, 1, 3, 2)
|
||||||
|
raw_kernel = np.exp(-0.5 * np.squeeze(ZZ_t @ INV_SIGMA @ ZZ)) * (1 + noise)
|
||||||
|
|
||||||
|
# shift the kernel so it will be centered
|
||||||
|
# raw_kernel_centered = kernel_shift(raw_kernel, scale_factor)
|
||||||
|
|
||||||
|
# Normalize the kernel and return
|
||||||
|
# kernel = raw_kernel_centered / np.sum(raw_kernel_centered)
|
||||||
|
kernel = raw_kernel / np.sum(raw_kernel)
|
||||||
|
return kernel
|
||||||
|
|
||||||
|
|
||||||
|
def fspecial_gaussian(hsize, sigma):
|
||||||
|
hsize = [hsize, hsize]
|
||||||
|
siz = [(hsize[0] - 1.0) / 2.0, (hsize[1] - 1.0) / 2.0]
|
||||||
|
std = sigma
|
||||||
|
[x, y] = np.meshgrid(np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1))
|
||||||
|
arg = -(x * x + y * y) / (2 * std * std)
|
||||||
|
h = np.exp(arg)
|
||||||
|
h[h < scipy.finfo(float).eps * h.max()] = 0
|
||||||
|
sumh = h.sum()
|
||||||
|
if sumh != 0:
|
||||||
|
h = h / sumh
|
||||||
|
return h
|
||||||
|
|
||||||
|
|
||||||
|
def fspecial_laplacian(alpha):
|
||||||
|
alpha = max([0, min([alpha, 1])])
|
||||||
|
h1 = alpha / (alpha + 1)
|
||||||
|
h2 = (1 - alpha) / (alpha + 1)
|
||||||
|
h = [[h1, h2, h1], [h2, -4 / (alpha + 1), h2], [h1, h2, h1]]
|
||||||
|
h = np.array(h)
|
||||||
|
return h
|
||||||
|
|
||||||
|
|
||||||
|
def fspecial(filter_type, *args, **kwargs):
|
||||||
|
'''
|
||||||
|
python code from:
|
||||||
|
https://github.com/ronaldosena/imagens-medicas-2/blob/40171a6c259edec7827a6693a93955de2bd39e76/Aulas/aula_2_-_uniform_filter/matlab_fspecial.py
|
||||||
|
'''
|
||||||
|
if filter_type == 'gaussian':
|
||||||
|
return fspecial_gaussian(*args, **kwargs)
|
||||||
|
if filter_type == 'laplacian':
|
||||||
|
return fspecial_laplacian(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
# --------------------------------------------
|
||||||
|
# degradation models
|
||||||
|
# --------------------------------------------
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def bicubic_degradation(x, sf=3):
|
||||||
|
'''
|
||||||
|
Args:
|
||||||
|
x: HxWxC image, [0, 1]
|
||||||
|
sf: down-scale factor
|
||||||
|
Return:
|
||||||
|
bicubicly downsampled LR image
|
||||||
|
'''
|
||||||
|
x = util.imresize_np(x, scale=1 / sf)
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
def srmd_degradation(x, k, sf=3):
|
||||||
|
''' blur + bicubic downsampling
|
||||||
|
Args:
|
||||||
|
x: HxWxC image, [0, 1]
|
||||||
|
k: hxw, double
|
||||||
|
sf: down-scale factor
|
||||||
|
Return:
|
||||||
|
downsampled LR image
|
||||||
|
Reference:
|
||||||
|
@inproceedings{zhang2018learning,
|
||||||
|
title={Learning a single convolutional super-resolution network for multiple degradations},
|
||||||
|
author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei},
|
||||||
|
booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
|
||||||
|
pages={3262--3271},
|
||||||
|
year={2018}
|
||||||
|
}
|
||||||
|
'''
|
||||||
|
x = ndimage.convolve(x, np.expand_dims(k, axis=2), mode='wrap') # 'nearest' | 'mirror'
|
||||||
|
x = bicubic_degradation(x, sf=sf)
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
def dpsr_degradation(x, k, sf=3):
|
||||||
|
''' bicubic downsampling + blur
|
||||||
|
Args:
|
||||||
|
x: HxWxC image, [0, 1]
|
||||||
|
k: hxw, double
|
||||||
|
sf: down-scale factor
|
||||||
|
Return:
|
||||||
|
downsampled LR image
|
||||||
|
Reference:
|
||||||
|
@inproceedings{zhang2019deep,
|
||||||
|
title={Deep Plug-and-Play Super-Resolution for Arbitrary Blur Kernels},
|
||||||
|
author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei},
|
||||||
|
booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
|
||||||
|
pages={1671--1681},
|
||||||
|
year={2019}
|
||||||
|
}
|
||||||
|
'''
|
||||||
|
x = bicubic_degradation(x, sf=sf)
|
||||||
|
x = ndimage.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
def classical_degradation(x, k, sf=3):
|
||||||
|
''' blur + downsampling
|
||||||
|
Args:
|
||||||
|
x: HxWxC image, [0, 1]/[0, 255]
|
||||||
|
k: hxw, double
|
||||||
|
sf: down-scale factor
|
||||||
|
Return:
|
||||||
|
downsampled LR image
|
||||||
|
'''
|
||||||
|
x = ndimage.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
|
||||||
|
# x = filters.correlate(x, np.expand_dims(np.flip(k), axis=2))
|
||||||
|
st = 0
|
||||||
|
return x[st::sf, st::sf, ...]
|
||||||
|
|
||||||
|
|
||||||
|
def add_sharpening(img, weight=0.5, radius=50, threshold=10):
|
||||||
|
"""USM sharpening. borrowed from real-ESRGAN
|
||||||
|
Input image: I; Blurry image: B.
|
||||||
|
1. K = I + weight * (I - B)
|
||||||
|
2. Mask = 1 if abs(I - B) > threshold, else: 0
|
||||||
|
3. Blur mask:
|
||||||
|
4. Out = Mask * K + (1 - Mask) * I
|
||||||
|
Args:
|
||||||
|
img (Numpy array): Input image, HWC, BGR; float32, [0, 1].
|
||||||
|
weight (float): Sharp weight. Default: 1.
|
||||||
|
radius (float): Kernel size of Gaussian blur. Default: 50.
|
||||||
|
threshold (int):
|
||||||
|
"""
|
||||||
|
if radius % 2 == 0:
|
||||||
|
radius += 1
|
||||||
|
blur = cv2.GaussianBlur(img, (radius, radius), 0)
|
||||||
|
residual = img - blur
|
||||||
|
mask = np.abs(residual) * 255 > threshold
|
||||||
|
mask = mask.astype('float32')
|
||||||
|
soft_mask = cv2.GaussianBlur(mask, (radius, radius), 0)
|
||||||
|
|
||||||
|
K = img + weight * residual
|
||||||
|
K = np.clip(K, 0, 1)
|
||||||
|
return soft_mask * K + (1 - soft_mask) * img
|
||||||
|
|
||||||
|
|
||||||
|
def add_blur(img, sf=4):
|
||||||
|
wd2 = 4.0 + sf
|
||||||
|
wd = 2.0 + 0.2 * sf
|
||||||
|
|
||||||
|
wd2 = wd2/4
|
||||||
|
wd = wd/4
|
||||||
|
|
||||||
|
if random.random() < 0.5:
|
||||||
|
l1 = wd2 * random.random()
|
||||||
|
l2 = wd2 * random.random()
|
||||||
|
k = anisotropic_Gaussian(ksize=random.randint(2, 11) + 3, theta=random.random() * np.pi, l1=l1, l2=l2)
|
||||||
|
else:
|
||||||
|
k = fspecial('gaussian', random.randint(2, 4) + 3, wd * random.random())
|
||||||
|
img = ndimage.convolve(img, np.expand_dims(k, axis=2), mode='mirror')
|
||||||
|
|
||||||
|
return img
|
||||||
|
|
||||||
|
|
||||||
|
def add_resize(img, sf=4):
|
||||||
|
rnum = np.random.rand()
|
||||||
|
if rnum > 0.8: # up
|
||||||
|
sf1 = random.uniform(1, 2)
|
||||||
|
elif rnum < 0.7: # down
|
||||||
|
sf1 = random.uniform(0.5 / sf, 1)
|
||||||
|
else:
|
||||||
|
sf1 = 1.0
|
||||||
|
img = cv2.resize(img, (int(sf1 * img.shape[1]), int(sf1 * img.shape[0])), interpolation=random.choice([1, 2, 3]))
|
||||||
|
img = np.clip(img, 0.0, 1.0)
|
||||||
|
|
||||||
|
return img
|
||||||
|
|
||||||
|
|
||||||
|
# def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
|
||||||
|
# noise_level = random.randint(noise_level1, noise_level2)
|
||||||
|
# rnum = np.random.rand()
|
||||||
|
# if rnum > 0.6: # add color Gaussian noise
|
||||||
|
# img += np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
|
||||||
|
# elif rnum < 0.4: # add grayscale Gaussian noise
|
||||||
|
# img += np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
|
||||||
|
# else: # add noise
|
||||||
|
# L = noise_level2 / 255.
|
||||||
|
# D = np.diag(np.random.rand(3))
|
||||||
|
# U = orth(np.random.rand(3, 3))
|
||||||
|
# conv = np.dot(np.dot(np.transpose(U), D), U)
|
||||||
|
# img += np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
|
||||||
|
# img = np.clip(img, 0.0, 1.0)
|
||||||
|
# return img
|
||||||
|
|
||||||
|
def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
|
||||||
|
noise_level = random.randint(noise_level1, noise_level2)
|
||||||
|
rnum = np.random.rand()
|
||||||
|
if rnum > 0.6: # add color Gaussian noise
|
||||||
|
img = img + np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
|
||||||
|
elif rnum < 0.4: # add grayscale Gaussian noise
|
||||||
|
img = img + np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
|
||||||
|
else: # add noise
|
||||||
|
L = noise_level2 / 255.
|
||||||
|
D = np.diag(np.random.rand(3))
|
||||||
|
U = orth(np.random.rand(3, 3))
|
||||||
|
conv = np.dot(np.dot(np.transpose(U), D), U)
|
||||||
|
img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
|
||||||
|
img = np.clip(img, 0.0, 1.0)
|
||||||
|
return img
|
||||||
|
|
||||||
|
|
||||||
|
def add_speckle_noise(img, noise_level1=2, noise_level2=25):
|
||||||
|
noise_level = random.randint(noise_level1, noise_level2)
|
||||||
|
img = np.clip(img, 0.0, 1.0)
|
||||||
|
rnum = random.random()
|
||||||
|
if rnum > 0.6:
|
||||||
|
img += img * np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
|
||||||
|
elif rnum < 0.4:
|
||||||
|
img += img * np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
|
||||||
|
else:
|
||||||
|
L = noise_level2 / 255.
|
||||||
|
D = np.diag(np.random.rand(3))
|
||||||
|
U = orth(np.random.rand(3, 3))
|
||||||
|
conv = np.dot(np.dot(np.transpose(U), D), U)
|
||||||
|
img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
|
||||||
|
img = np.clip(img, 0.0, 1.0)
|
||||||
|
return img
|
||||||
|
|
||||||
|
|
||||||
|
def add_Poisson_noise(img):
|
||||||
|
img = np.clip((img * 255.0).round(), 0, 255) / 255.
|
||||||
|
vals = 10 ** (2 * random.random() + 2.0) # [2, 4]
|
||||||
|
if random.random() < 0.5:
|
||||||
|
img = np.random.poisson(img * vals).astype(np.float32) / vals
|
||||||
|
else:
|
||||||
|
img_gray = np.dot(img[..., :3], [0.299, 0.587, 0.114])
|
||||||
|
img_gray = np.clip((img_gray * 255.0).round(), 0, 255) / 255.
|
||||||
|
noise_gray = np.random.poisson(img_gray * vals).astype(np.float32) / vals - img_gray
|
||||||
|
img += noise_gray[:, :, np.newaxis]
|
||||||
|
img = np.clip(img, 0.0, 1.0)
|
||||||
|
return img
|
||||||
|
|
||||||
|
|
||||||
|
def add_JPEG_noise(img):
|
||||||
|
quality_factor = random.randint(80, 95)
|
||||||
|
img = cv2.cvtColor(util.single2uint(img), cv2.COLOR_RGB2BGR)
|
||||||
|
result, encimg = cv2.imencode('.jpg', img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor])
|
||||||
|
img = cv2.imdecode(encimg, 1)
|
||||||
|
img = cv2.cvtColor(util.uint2single(img), cv2.COLOR_BGR2RGB)
|
||||||
|
return img
|
||||||
|
|
||||||
|
|
||||||
|
def random_crop(lq, hq, sf=4, lq_patchsize=64):
|
||||||
|
h, w = lq.shape[:2]
|
||||||
|
rnd_h = random.randint(0, h - lq_patchsize)
|
||||||
|
rnd_w = random.randint(0, w - lq_patchsize)
|
||||||
|
lq = lq[rnd_h:rnd_h + lq_patchsize, rnd_w:rnd_w + lq_patchsize, :]
|
||||||
|
|
||||||
|
rnd_h_H, rnd_w_H = int(rnd_h * sf), int(rnd_w * sf)
|
||||||
|
hq = hq[rnd_h_H:rnd_h_H + lq_patchsize * sf, rnd_w_H:rnd_w_H + lq_patchsize * sf, :]
|
||||||
|
return lq, hq
|
||||||
|
|
||||||
|
|
||||||
|
def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
|
||||||
|
"""
|
||||||
|
This is the degradation model of BSRGAN from the paper
|
||||||
|
"Designing a Practical Degradation Model for Deep Blind Image Super-Resolution"
|
||||||
|
----------
|
||||||
|
img: HXWXC, [0, 1], its size should be large than (lq_patchsizexsf)x(lq_patchsizexsf)
|
||||||
|
sf: scale factor
|
||||||
|
isp_model: camera ISP model
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1]
|
||||||
|
hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1]
|
||||||
|
"""
|
||||||
|
isp_prob, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25
|
||||||
|
sf_ori = sf
|
||||||
|
|
||||||
|
h1, w1 = img.shape[:2]
|
||||||
|
img = img.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop
|
||||||
|
h, w = img.shape[:2]
|
||||||
|
|
||||||
|
if h < lq_patchsize * sf or w < lq_patchsize * sf:
|
||||||
|
raise ValueError(f'img size ({h1}X{w1}) is too small!')
|
||||||
|
|
||||||
|
hq = img.copy()
|
||||||
|
|
||||||
|
if sf == 4 and random.random() < scale2_prob: # downsample1
|
||||||
|
if np.random.rand() < 0.5:
|
||||||
|
img = cv2.resize(img, (int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])),
|
||||||
|
interpolation=random.choice([1, 2, 3]))
|
||||||
|
else:
|
||||||
|
img = util.imresize_np(img, 1 / 2, True)
|
||||||
|
img = np.clip(img, 0.0, 1.0)
|
||||||
|
sf = 2
|
||||||
|
|
||||||
|
shuffle_order = random.sample(range(7), 7)
|
||||||
|
idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
|
||||||
|
if idx1 > idx2: # keep downsample3 last
|
||||||
|
shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]
|
||||||
|
|
||||||
|
for i in shuffle_order:
|
||||||
|
|
||||||
|
if i == 0:
|
||||||
|
img = add_blur(img, sf=sf)
|
||||||
|
|
||||||
|
elif i == 1:
|
||||||
|
img = add_blur(img, sf=sf)
|
||||||
|
|
||||||
|
elif i == 2:
|
||||||
|
a, b = img.shape[1], img.shape[0]
|
||||||
|
# downsample2
|
||||||
|
if random.random() < 0.75:
|
||||||
|
sf1 = random.uniform(1, 2 * sf)
|
||||||
|
img = cv2.resize(img, (int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])),
|
||||||
|
interpolation=random.choice([1, 2, 3]))
|
||||||
|
else:
|
||||||
|
k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
|
||||||
|
k_shifted = shift_pixel(k, sf)
|
||||||
|
k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel
|
||||||
|
img = ndimage.convolve(img, np.expand_dims(k_shifted, axis=2), mode='mirror')
|
||||||
|
img = img[0::sf, 0::sf, ...] # nearest downsampling
|
||||||
|
img = np.clip(img, 0.0, 1.0)
|
||||||
|
|
||||||
|
elif i == 3:
|
||||||
|
# downsample3
|
||||||
|
img = cv2.resize(img, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
|
||||||
|
img = np.clip(img, 0.0, 1.0)
|
||||||
|
|
||||||
|
elif i == 4:
|
||||||
|
# add Gaussian noise
|
||||||
|
img = add_Gaussian_noise(img, noise_level1=2, noise_level2=8)
|
||||||
|
|
||||||
|
elif i == 5:
|
||||||
|
# add JPEG noise
|
||||||
|
if random.random() < jpeg_prob:
|
||||||
|
img = add_JPEG_noise(img)
|
||||||
|
|
||||||
|
elif i == 6:
|
||||||
|
# add processed camera sensor noise
|
||||||
|
if random.random() < isp_prob and isp_model is not None:
|
||||||
|
with torch.no_grad():
|
||||||
|
img, hq = isp_model.forward(img.copy(), hq)
|
||||||
|
|
||||||
|
# add final JPEG compression noise
|
||||||
|
img = add_JPEG_noise(img)
|
||||||
|
|
||||||
|
# random crop
|
||||||
|
img, hq = random_crop(img, hq, sf_ori, lq_patchsize)
|
||||||
|
|
||||||
|
return img, hq
|
||||||
|
|
||||||
|
|
||||||
|
# todo no isp_model?
|
||||||
|
def degradation_bsrgan_variant(image, sf=4, isp_model=None, up=False):
|
||||||
|
"""
|
||||||
|
This is the degradation model of BSRGAN from the paper
|
||||||
|
"Designing a Practical Degradation Model for Deep Blind Image Super-Resolution"
|
||||||
|
----------
|
||||||
|
sf: scale factor
|
||||||
|
isp_model: camera ISP model
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1]
|
||||||
|
hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1]
|
||||||
|
"""
|
||||||
|
image = util.uint2single(image)
|
||||||
|
isp_prob, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25
|
||||||
|
sf_ori = sf
|
||||||
|
|
||||||
|
h1, w1 = image.shape[:2]
|
||||||
|
image = image.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop
|
||||||
|
h, w = image.shape[:2]
|
||||||
|
|
||||||
|
hq = image.copy()
|
||||||
|
|
||||||
|
if sf == 4 and random.random() < scale2_prob: # downsample1
|
||||||
|
if np.random.rand() < 0.5:
|
||||||
|
image = cv2.resize(image, (int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])),
|
||||||
|
interpolation=random.choice([1, 2, 3]))
|
||||||
|
else:
|
||||||
|
image = util.imresize_np(image, 1 / 2, True)
|
||||||
|
image = np.clip(image, 0.0, 1.0)
|
||||||
|
sf = 2
|
||||||
|
|
||||||
|
shuffle_order = random.sample(range(7), 7)
|
||||||
|
idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
|
||||||
|
if idx1 > idx2: # keep downsample3 last
|
||||||
|
shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]
|
||||||
|
|
||||||
|
for i in shuffle_order:
|
||||||
|
|
||||||
|
if i == 0:
|
||||||
|
image = add_blur(image, sf=sf)
|
||||||
|
|
||||||
|
# elif i == 1:
|
||||||
|
# image = add_blur(image, sf=sf)
|
||||||
|
|
||||||
|
if i == 0:
|
||||||
|
pass
|
||||||
|
|
||||||
|
elif i == 2:
|
||||||
|
a, b = image.shape[1], image.shape[0]
|
||||||
|
# downsample2
|
||||||
|
if random.random() < 0.8:
|
||||||
|
sf1 = random.uniform(1, 2 * sf)
|
||||||
|
image = cv2.resize(image, (int(1 / sf1 * image.shape[1]), int(1 / sf1 * image.shape[0])),
|
||||||
|
interpolation=random.choice([1, 2, 3]))
|
||||||
|
else:
|
||||||
|
k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
|
||||||
|
k_shifted = shift_pixel(k, sf)
|
||||||
|
k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel
|
||||||
|
image = ndimage.convolve(image, np.expand_dims(k_shifted, axis=2), mode='mirror')
|
||||||
|
image = image[0::sf, 0::sf, ...] # nearest downsampling
|
||||||
|
|
||||||
|
image = np.clip(image, 0.0, 1.0)
|
||||||
|
|
||||||
|
elif i == 3:
|
||||||
|
# downsample3
|
||||||
|
image = cv2.resize(image, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
|
||||||
|
image = np.clip(image, 0.0, 1.0)
|
||||||
|
|
||||||
|
elif i == 4:
|
||||||
|
# add Gaussian noise
|
||||||
|
image = add_Gaussian_noise(image, noise_level1=1, noise_level2=2)
|
||||||
|
|
||||||
|
elif i == 5:
|
||||||
|
# add JPEG noise
|
||||||
|
if random.random() < jpeg_prob:
|
||||||
|
image = add_JPEG_noise(image)
|
||||||
|
#
|
||||||
|
# elif i == 6:
|
||||||
|
# # add processed camera sensor noise
|
||||||
|
# if random.random() < isp_prob and isp_model is not None:
|
||||||
|
# with torch.no_grad():
|
||||||
|
# img, hq = isp_model.forward(img.copy(), hq)
|
||||||
|
|
||||||
|
# add final JPEG compression noise
|
||||||
|
image = add_JPEG_noise(image)
|
||||||
|
image = util.single2uint(image)
|
||||||
|
if up:
|
||||||
|
image = cv2.resize(image, (w1, h1), interpolation=cv2.INTER_CUBIC) # todo: random, as above? want to condition on it then
|
||||||
|
example = {"image": image}
|
||||||
|
return example
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
print("hey")
|
||||||
|
img = util.imread_uint('utils/test.png', 3)
|
||||||
|
img = img[:448, :448]
|
||||||
|
h = img.shape[0] // 4
|
||||||
|
print("resizing to", h)
|
||||||
|
sf = 4
|
||||||
|
deg_fn = partial(degradation_bsrgan_variant, sf=sf)
|
||||||
|
for i in range(20):
|
||||||
|
print(i)
|
||||||
|
img_hq = img
|
||||||
|
img_lq = deg_fn(img)["image"]
|
||||||
|
img_hq, img_lq = util.uint2single(img_hq), util.uint2single(img_lq)
|
||||||
|
print(img_lq)
|
||||||
|
img_lq_bicubic = albumentations.SmallestMaxSize(max_size=h, interpolation=cv2.INTER_CUBIC)(image=img_hq)["image"]
|
||||||
|
print(img_lq.shape)
|
||||||
|
print("bicubic", img_lq_bicubic.shape)
|
||||||
|
print(img_hq.shape)
|
||||||
|
lq_nearest = cv2.resize(util.single2uint(img_lq), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
|
||||||
|
interpolation=0)
|
||||||
|
lq_bicubic_nearest = cv2.resize(util.single2uint(img_lq_bicubic),
|
||||||
|
(int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
|
||||||
|
interpolation=0)
|
||||||
|
img_concat = np.concatenate([lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1)
|
||||||
|
util.imsave(img_concat, str(i) + '.png')
|
After Width: | Height: | Size: 431 KiB |
@ -0,0 +1,916 @@
|
|||||||
|
import os
|
||||||
|
import math
|
||||||
|
import random
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
import cv2
|
||||||
|
from torchvision.utils import make_grid
|
||||||
|
from datetime import datetime
|
||||||
|
#import matplotlib.pyplot as plt # TODO: check with Dominik, also bsrgan.py vs bsrgan_light.py
|
||||||
|
|
||||||
|
|
||||||
|
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
|
||||||
|
|
||||||
|
|
||||||
|
'''
|
||||||
|
# --------------------------------------------
|
||||||
|
# Kai Zhang (github: https://github.com/cszn)
|
||||||
|
# 03/Mar/2019
|
||||||
|
# --------------------------------------------
|
||||||
|
# https://github.com/twhui/SRGAN-pyTorch
|
||||||
|
# https://github.com/xinntao/BasicSR
|
||||||
|
# --------------------------------------------
|
||||||
|
'''
|
||||||
|
|
||||||
|
|
||||||
|
IMG_EXTENSIONS = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP', '.tif']
|
||||||
|
|
||||||
|
|
||||||
|
def is_image_file(filename):
|
||||||
|
return any(filename.endswith(extension) for extension in IMG_EXTENSIONS)
|
||||||
|
|
||||||
|
|
||||||
|
def get_timestamp():
|
||||||
|
return datetime.now().strftime('%y%m%d-%H%M%S')
|
||||||
|
|
||||||
|
|
||||||
|
def imshow(x, title=None, cbar=False, figsize=None):
|
||||||
|
plt.figure(figsize=figsize)
|
||||||
|
plt.imshow(np.squeeze(x), interpolation='nearest', cmap='gray')
|
||||||
|
if title:
|
||||||
|
plt.title(title)
|
||||||
|
if cbar:
|
||||||
|
plt.colorbar()
|
||||||
|
plt.show()
|
||||||
|
|
||||||
|
|
||||||
|
def surf(Z, cmap='rainbow', figsize=None):
|
||||||
|
plt.figure(figsize=figsize)
|
||||||
|
ax3 = plt.axes(projection='3d')
|
||||||
|
|
||||||
|
w, h = Z.shape[:2]
|
||||||
|
xx = np.arange(0,w,1)
|
||||||
|
yy = np.arange(0,h,1)
|
||||||
|
X, Y = np.meshgrid(xx, yy)
|
||||||
|
ax3.plot_surface(X,Y,Z,cmap=cmap)
|
||||||
|
#ax3.contour(X,Y,Z, zdim='z',offset=-2,cmap=cmap)
|
||||||
|
plt.show()
|
||||||
|
|
||||||
|
|
||||||
|
'''
|
||||||
|
# --------------------------------------------
|
||||||
|
# get image pathes
|
||||||
|
# --------------------------------------------
|
||||||
|
'''
|
||||||
|
|
||||||
|
|
||||||
|
def get_image_paths(dataroot):
|
||||||
|
paths = None # return None if dataroot is None
|
||||||
|
if dataroot is not None:
|
||||||
|
paths = sorted(_get_paths_from_images(dataroot))
|
||||||
|
return paths
|
||||||
|
|
||||||
|
|
||||||
|
def _get_paths_from_images(path):
|
||||||
|
assert os.path.isdir(path), '{:s} is not a valid directory'.format(path)
|
||||||
|
images = []
|
||||||
|
for dirpath, _, fnames in sorted(os.walk(path)):
|
||||||
|
for fname in sorted(fnames):
|
||||||
|
if is_image_file(fname):
|
||||||
|
img_path = os.path.join(dirpath, fname)
|
||||||
|
images.append(img_path)
|
||||||
|
assert images, '{:s} has no valid image file'.format(path)
|
||||||
|
return images
|
||||||
|
|
||||||
|
|
||||||
|
'''
|
||||||
|
# --------------------------------------------
|
||||||
|
# split large images into small images
|
||||||
|
# --------------------------------------------
|
||||||
|
'''
|
||||||
|
|
||||||
|
|
||||||
|
def patches_from_image(img, p_size=512, p_overlap=64, p_max=800):
|
||||||
|
w, h = img.shape[:2]
|
||||||
|
patches = []
|
||||||
|
if w > p_max and h > p_max:
|
||||||
|
w1 = list(np.arange(0, w-p_size, p_size-p_overlap, dtype=np.int))
|
||||||
|
h1 = list(np.arange(0, h-p_size, p_size-p_overlap, dtype=np.int))
|
||||||
|
w1.append(w-p_size)
|
||||||
|
h1.append(h-p_size)
|
||||||
|
# print(w1)
|
||||||
|
# print(h1)
|
||||||
|
for i in w1:
|
||||||
|
for j in h1:
|
||||||
|
patches.append(img[i:i+p_size, j:j+p_size,:])
|
||||||
|
else:
|
||||||
|
patches.append(img)
|
||||||
|
|
||||||
|
return patches
|
||||||
|
|
||||||
|
|
||||||
|
def imssave(imgs, img_path):
|
||||||
|
"""
|
||||||
|
imgs: list, N images of size WxHxC
|
||||||
|
"""
|
||||||
|
img_name, ext = os.path.splitext(os.path.basename(img_path))
|
||||||
|
|
||||||
|
for i, img in enumerate(imgs):
|
||||||
|
if img.ndim == 3:
|
||||||
|
img = img[:, :, [2, 1, 0]]
|
||||||
|
new_path = os.path.join(os.path.dirname(img_path), img_name+str('_s{:04d}'.format(i))+'.png')
|
||||||
|
cv2.imwrite(new_path, img)
|
||||||
|
|
||||||
|
|
||||||
|
def split_imageset(original_dataroot, taget_dataroot, n_channels=3, p_size=800, p_overlap=96, p_max=1000):
|
||||||
|
"""
|
||||||
|
split the large images from original_dataroot into small overlapped images with size (p_size)x(p_size),
|
||||||
|
and save them into taget_dataroot; only the images with larger size than (p_max)x(p_max)
|
||||||
|
will be splitted.
|
||||||
|
Args:
|
||||||
|
original_dataroot:
|
||||||
|
taget_dataroot:
|
||||||
|
p_size: size of small images
|
||||||
|
p_overlap: patch size in training is a good choice
|
||||||
|
p_max: images with smaller size than (p_max)x(p_max) keep unchanged.
|
||||||
|
"""
|
||||||
|
paths = get_image_paths(original_dataroot)
|
||||||
|
for img_path in paths:
|
||||||
|
# img_name, ext = os.path.splitext(os.path.basename(img_path))
|
||||||
|
img = imread_uint(img_path, n_channels=n_channels)
|
||||||
|
patches = patches_from_image(img, p_size, p_overlap, p_max)
|
||||||
|
imssave(patches, os.path.join(taget_dataroot,os.path.basename(img_path)))
|
||||||
|
#if original_dataroot == taget_dataroot:
|
||||||
|
#del img_path
|
||||||
|
|
||||||
|
'''
|
||||||
|
# --------------------------------------------
|
||||||
|
# makedir
|
||||||
|
# --------------------------------------------
|
||||||
|
'''
|
||||||
|
|
||||||
|
|
||||||
|
def mkdir(path):
|
||||||
|
if not os.path.exists(path):
|
||||||
|
os.makedirs(path)
|
||||||
|
|
||||||
|
|
||||||
|
def mkdirs(paths):
|
||||||
|
if isinstance(paths, str):
|
||||||
|
mkdir(paths)
|
||||||
|
else:
|
||||||
|
for path in paths:
|
||||||
|
mkdir(path)
|
||||||
|
|
||||||
|
|
||||||
|
def mkdir_and_rename(path):
|
||||||
|
if os.path.exists(path):
|
||||||
|
new_name = path + '_archived_' + get_timestamp()
|
||||||
|
print('Path already exists. Rename it to [{:s}]'.format(new_name))
|
||||||
|
os.rename(path, new_name)
|
||||||
|
os.makedirs(path)
|
||||||
|
|
||||||
|
|
||||||
|
'''
|
||||||
|
# --------------------------------------------
|
||||||
|
# read image from path
|
||||||
|
# opencv is fast, but read BGR numpy image
|
||||||
|
# --------------------------------------------
|
||||||
|
'''
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------
|
||||||
|
# get uint8 image of size HxWxn_channles (RGB)
|
||||||
|
# --------------------------------------------
|
||||||
|
def imread_uint(path, n_channels=3):
|
||||||
|
# input: path
|
||||||
|
# output: HxWx3(RGB or GGG), or HxWx1 (G)
|
||||||
|
if n_channels == 1:
|
||||||
|
img = cv2.imread(path, 0) # cv2.IMREAD_GRAYSCALE
|
||||||
|
img = np.expand_dims(img, axis=2) # HxWx1
|
||||||
|
elif n_channels == 3:
|
||||||
|
img = cv2.imread(path, cv2.IMREAD_UNCHANGED) # BGR or G
|
||||||
|
if img.ndim == 2:
|
||||||
|
img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB) # GGG
|
||||||
|
else:
|
||||||
|
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # RGB
|
||||||
|
return img
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------
|
||||||
|
# matlab's imwrite
|
||||||
|
# --------------------------------------------
|
||||||
|
def imsave(img, img_path):
|
||||||
|
img = np.squeeze(img)
|
||||||
|
if img.ndim == 3:
|
||||||
|
img = img[:, :, [2, 1, 0]]
|
||||||
|
cv2.imwrite(img_path, img)
|
||||||
|
|
||||||
|
def imwrite(img, img_path):
|
||||||
|
img = np.squeeze(img)
|
||||||
|
if img.ndim == 3:
|
||||||
|
img = img[:, :, [2, 1, 0]]
|
||||||
|
cv2.imwrite(img_path, img)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------
|
||||||
|
# get single image of size HxWxn_channles (BGR)
|
||||||
|
# --------------------------------------------
|
||||||
|
def read_img(path):
|
||||||
|
# read image by cv2
|
||||||
|
# return: Numpy float32, HWC, BGR, [0,1]
|
||||||
|
img = cv2.imread(path, cv2.IMREAD_UNCHANGED) # cv2.IMREAD_GRAYSCALE
|
||||||
|
img = img.astype(np.float32) / 255.
|
||||||
|
if img.ndim == 2:
|
||||||
|
img = np.expand_dims(img, axis=2)
|
||||||
|
# some images have 4 channels
|
||||||
|
if img.shape[2] > 3:
|
||||||
|
img = img[:, :, :3]
|
||||||
|
return img
|
||||||
|
|
||||||
|
|
||||||
|
'''
|
||||||
|
# --------------------------------------------
|
||||||
|
# image format conversion
|
||||||
|
# --------------------------------------------
|
||||||
|
# numpy(single) <---> numpy(unit)
|
||||||
|
# numpy(single) <---> tensor
|
||||||
|
# numpy(unit) <---> tensor
|
||||||
|
# --------------------------------------------
|
||||||
|
'''
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------
|
||||||
|
# numpy(single) [0, 1] <---> numpy(unit)
|
||||||
|
# --------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def uint2single(img):
|
||||||
|
|
||||||
|
return np.float32(img/255.)
|
||||||
|
|
||||||
|
|
||||||
|
def single2uint(img):
|
||||||
|
|
||||||
|
return np.uint8((img.clip(0, 1)*255.).round())
|
||||||
|
|
||||||
|
|
||||||
|
def uint162single(img):
|
||||||
|
|
||||||
|
return np.float32(img/65535.)
|
||||||
|
|
||||||
|
|
||||||
|
def single2uint16(img):
|
||||||
|
|
||||||
|
return np.uint16((img.clip(0, 1)*65535.).round())
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------
|
||||||
|
# numpy(unit) (HxWxC or HxW) <---> tensor
|
||||||
|
# --------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
# convert uint to 4-dimensional torch tensor
|
||||||
|
def uint2tensor4(img):
|
||||||
|
if img.ndim == 2:
|
||||||
|
img = np.expand_dims(img, axis=2)
|
||||||
|
return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1).float().div(255.).unsqueeze(0)
|
||||||
|
|
||||||
|
|
||||||
|
# convert uint to 3-dimensional torch tensor
|
||||||
|
def uint2tensor3(img):
|
||||||
|
if img.ndim == 2:
|
||||||
|
img = np.expand_dims(img, axis=2)
|
||||||
|
return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1).float().div(255.)
|
||||||
|
|
||||||
|
|
||||||
|
# convert 2/3/4-dimensional torch tensor to uint
|
||||||
|
def tensor2uint(img):
|
||||||
|
img = img.data.squeeze().float().clamp_(0, 1).cpu().numpy()
|
||||||
|
if img.ndim == 3:
|
||||||
|
img = np.transpose(img, (1, 2, 0))
|
||||||
|
return np.uint8((img*255.0).round())
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------
|
||||||
|
# numpy(single) (HxWxC) <---> tensor
|
||||||
|
# --------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
# convert single (HxWxC) to 3-dimensional torch tensor
|
||||||
|
def single2tensor3(img):
|
||||||
|
return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1).float()
|
||||||
|
|
||||||
|
|
||||||
|
# convert single (HxWxC) to 4-dimensional torch tensor
|
||||||
|
def single2tensor4(img):
|
||||||
|
return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1).float().unsqueeze(0)
|
||||||
|
|
||||||
|
|
||||||
|
# convert torch tensor to single
|
||||||
|
def tensor2single(img):
|
||||||
|
img = img.data.squeeze().float().cpu().numpy()
|
||||||
|
if img.ndim == 3:
|
||||||
|
img = np.transpose(img, (1, 2, 0))
|
||||||
|
|
||||||
|
return img
|
||||||
|
|
||||||
|
# convert torch tensor to single
|
||||||
|
def tensor2single3(img):
|
||||||
|
img = img.data.squeeze().float().cpu().numpy()
|
||||||
|
if img.ndim == 3:
|
||||||
|
img = np.transpose(img, (1, 2, 0))
|
||||||
|
elif img.ndim == 2:
|
||||||
|
img = np.expand_dims(img, axis=2)
|
||||||
|
return img
|
||||||
|
|
||||||
|
|
||||||
|
def single2tensor5(img):
|
||||||
|
return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1, 3).float().unsqueeze(0)
|
||||||
|
|
||||||
|
|
||||||
|
def single32tensor5(img):
|
||||||
|
return torch.from_numpy(np.ascontiguousarray(img)).float().unsqueeze(0).unsqueeze(0)
|
||||||
|
|
||||||
|
|
||||||
|
def single42tensor4(img):
|
||||||
|
return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1, 3).float()
|
||||||
|
|
||||||
|
|
||||||
|
# from skimage.io import imread, imsave
|
||||||
|
def tensor2img(tensor, out_type=np.uint8, min_max=(0, 1)):
|
||||||
|
'''
|
||||||
|
Converts a torch Tensor into an image Numpy array of BGR channel order
|
||||||
|
Input: 4D(B,(3/1),H,W), 3D(C,H,W), or 2D(H,W), any range, RGB channel order
|
||||||
|
Output: 3D(H,W,C) or 2D(H,W), [0,255], np.uint8 (default)
|
||||||
|
'''
|
||||||
|
tensor = tensor.squeeze().float().cpu().clamp_(*min_max) # squeeze first, then clamp
|
||||||
|
tensor = (tensor - min_max[0]) / (min_max[1] - min_max[0]) # to range [0,1]
|
||||||
|
n_dim = tensor.dim()
|
||||||
|
if n_dim == 4:
|
||||||
|
n_img = len(tensor)
|
||||||
|
img_np = make_grid(tensor, nrow=int(math.sqrt(n_img)), normalize=False).numpy()
|
||||||
|
img_np = np.transpose(img_np[[2, 1, 0], :, :], (1, 2, 0)) # HWC, BGR
|
||||||
|
elif n_dim == 3:
|
||||||
|
img_np = tensor.numpy()
|
||||||
|
img_np = np.transpose(img_np[[2, 1, 0], :, :], (1, 2, 0)) # HWC, BGR
|
||||||
|
elif n_dim == 2:
|
||||||
|
img_np = tensor.numpy()
|
||||||
|
else:
|
||||||
|
raise TypeError(
|
||||||
|
'Only support 4D, 3D and 2D tensor. But received with dimension: {:d}'.format(n_dim))
|
||||||
|
if out_type == np.uint8:
|
||||||
|
img_np = (img_np * 255.0).round()
|
||||||
|
# Important. Unlike matlab, numpy.unit8() WILL NOT round by default.
|
||||||
|
return img_np.astype(out_type)
|
||||||
|
|
||||||
|
|
||||||
|
'''
|
||||||
|
# --------------------------------------------
|
||||||
|
# Augmentation, flipe and/or rotate
|
||||||
|
# --------------------------------------------
|
||||||
|
# The following two are enough.
|
||||||
|
# (1) augmet_img: numpy image of WxHxC or WxH
|
||||||
|
# (2) augment_img_tensor4: tensor image 1xCxWxH
|
||||||
|
# --------------------------------------------
|
||||||
|
'''
|
||||||
|
|
||||||
|
|
||||||
|
def augment_img(img, mode=0):
|
||||||
|
'''Kai Zhang (github: https://github.com/cszn)
|
||||||
|
'''
|
||||||
|
if mode == 0:
|
||||||
|
return img
|
||||||
|
elif mode == 1:
|
||||||
|
return np.flipud(np.rot90(img))
|
||||||
|
elif mode == 2:
|
||||||
|
return np.flipud(img)
|
||||||
|
elif mode == 3:
|
||||||
|
return np.rot90(img, k=3)
|
||||||
|
elif mode == 4:
|
||||||
|
return np.flipud(np.rot90(img, k=2))
|
||||||
|
elif mode == 5:
|
||||||
|
return np.rot90(img)
|
||||||
|
elif mode == 6:
|
||||||
|
return np.rot90(img, k=2)
|
||||||
|
elif mode == 7:
|
||||||
|
return np.flipud(np.rot90(img, k=3))
|
||||||
|
|
||||||
|
|
||||||
|
def augment_img_tensor4(img, mode=0):
|
||||||
|
'''Kai Zhang (github: https://github.com/cszn)
|
||||||
|
'''
|
||||||
|
if mode == 0:
|
||||||
|
return img
|
||||||
|
elif mode == 1:
|
||||||
|
return img.rot90(1, [2, 3]).flip([2])
|
||||||
|
elif mode == 2:
|
||||||
|
return img.flip([2])
|
||||||
|
elif mode == 3:
|
||||||
|
return img.rot90(3, [2, 3])
|
||||||
|
elif mode == 4:
|
||||||
|
return img.rot90(2, [2, 3]).flip([2])
|
||||||
|
elif mode == 5:
|
||||||
|
return img.rot90(1, [2, 3])
|
||||||
|
elif mode == 6:
|
||||||
|
return img.rot90(2, [2, 3])
|
||||||
|
elif mode == 7:
|
||||||
|
return img.rot90(3, [2, 3]).flip([2])
|
||||||
|
|
||||||
|
|
||||||
|
def augment_img_tensor(img, mode=0):
|
||||||
|
'''Kai Zhang (github: https://github.com/cszn)
|
||||||
|
'''
|
||||||
|
img_size = img.size()
|
||||||
|
img_np = img.data.cpu().numpy()
|
||||||
|
if len(img_size) == 3:
|
||||||
|
img_np = np.transpose(img_np, (1, 2, 0))
|
||||||
|
elif len(img_size) == 4:
|
||||||
|
img_np = np.transpose(img_np, (2, 3, 1, 0))
|
||||||
|
img_np = augment_img(img_np, mode=mode)
|
||||||
|
img_tensor = torch.from_numpy(np.ascontiguousarray(img_np))
|
||||||
|
if len(img_size) == 3:
|
||||||
|
img_tensor = img_tensor.permute(2, 0, 1)
|
||||||
|
elif len(img_size) == 4:
|
||||||
|
img_tensor = img_tensor.permute(3, 2, 0, 1)
|
||||||
|
|
||||||
|
return img_tensor.type_as(img)
|
||||||
|
|
||||||
|
|
||||||
|
def augment_img_np3(img, mode=0):
|
||||||
|
if mode == 0:
|
||||||
|
return img
|
||||||
|
elif mode == 1:
|
||||||
|
return img.transpose(1, 0, 2)
|
||||||
|
elif mode == 2:
|
||||||
|
return img[::-1, :, :]
|
||||||
|
elif mode == 3:
|
||||||
|
img = img[::-1, :, :]
|
||||||
|
img = img.transpose(1, 0, 2)
|
||||||
|
return img
|
||||||
|
elif mode == 4:
|
||||||
|
return img[:, ::-1, :]
|
||||||
|
elif mode == 5:
|
||||||
|
img = img[:, ::-1, :]
|
||||||
|
img = img.transpose(1, 0, 2)
|
||||||
|
return img
|
||||||
|
elif mode == 6:
|
||||||
|
img = img[:, ::-1, :]
|
||||||
|
img = img[::-1, :, :]
|
||||||
|
return img
|
||||||
|
elif mode == 7:
|
||||||
|
img = img[:, ::-1, :]
|
||||||
|
img = img[::-1, :, :]
|
||||||
|
img = img.transpose(1, 0, 2)
|
||||||
|
return img
|
||||||
|
|
||||||
|
|
||||||
|
def augment_imgs(img_list, hflip=True, rot=True):
|
||||||
|
# horizontal flip OR rotate
|
||||||
|
hflip = hflip and random.random() < 0.5
|
||||||
|
vflip = rot and random.random() < 0.5
|
||||||
|
rot90 = rot and random.random() < 0.5
|
||||||
|
|
||||||
|
def _augment(img):
|
||||||
|
if hflip:
|
||||||
|
img = img[:, ::-1, :]
|
||||||
|
if vflip:
|
||||||
|
img = img[::-1, :, :]
|
||||||
|
if rot90:
|
||||||
|
img = img.transpose(1, 0, 2)
|
||||||
|
return img
|
||||||
|
|
||||||
|
return [_augment(img) for img in img_list]
|
||||||
|
|
||||||
|
|
||||||
|
'''
|
||||||
|
# --------------------------------------------
|
||||||
|
# modcrop and shave
|
||||||
|
# --------------------------------------------
|
||||||
|
'''
|
||||||
|
|
||||||
|
|
||||||
|
def modcrop(img_in, scale):
|
||||||
|
# img_in: Numpy, HWC or HW
|
||||||
|
img = np.copy(img_in)
|
||||||
|
if img.ndim == 2:
|
||||||
|
H, W = img.shape
|
||||||
|
H_r, W_r = H % scale, W % scale
|
||||||
|
img = img[:H - H_r, :W - W_r]
|
||||||
|
elif img.ndim == 3:
|
||||||
|
H, W, C = img.shape
|
||||||
|
H_r, W_r = H % scale, W % scale
|
||||||
|
img = img[:H - H_r, :W - W_r, :]
|
||||||
|
else:
|
||||||
|
raise ValueError('Wrong img ndim: [{:d}].'.format(img.ndim))
|
||||||
|
return img
|
||||||
|
|
||||||
|
|
||||||
|
def shave(img_in, border=0):
|
||||||
|
# img_in: Numpy, HWC or HW
|
||||||
|
img = np.copy(img_in)
|
||||||
|
h, w = img.shape[:2]
|
||||||
|
img = img[border:h-border, border:w-border]
|
||||||
|
return img
|
||||||
|
|
||||||
|
|
||||||
|
'''
|
||||||
|
# --------------------------------------------
|
||||||
|
# image processing process on numpy image
|
||||||
|
# channel_convert(in_c, tar_type, img_list):
|
||||||
|
# rgb2ycbcr(img, only_y=True):
|
||||||
|
# bgr2ycbcr(img, only_y=True):
|
||||||
|
# ycbcr2rgb(img):
|
||||||
|
# --------------------------------------------
|
||||||
|
'''
|
||||||
|
|
||||||
|
|
||||||
|
def rgb2ycbcr(img, only_y=True):
|
||||||
|
'''same as matlab rgb2ycbcr
|
||||||
|
only_y: only return Y channel
|
||||||
|
Input:
|
||||||
|
uint8, [0, 255]
|
||||||
|
float, [0, 1]
|
||||||
|
'''
|
||||||
|
in_img_type = img.dtype
|
||||||
|
img.astype(np.float32)
|
||||||
|
if in_img_type != np.uint8:
|
||||||
|
img *= 255.
|
||||||
|
# convert
|
||||||
|
if only_y:
|
||||||
|
rlt = np.dot(img, [65.481, 128.553, 24.966]) / 255.0 + 16.0
|
||||||
|
else:
|
||||||
|
rlt = np.matmul(img, [[65.481, -37.797, 112.0], [128.553, -74.203, -93.786],
|
||||||
|
[24.966, 112.0, -18.214]]) / 255.0 + [16, 128, 128]
|
||||||
|
if in_img_type == np.uint8:
|
||||||
|
rlt = rlt.round()
|
||||||
|
else:
|
||||||
|
rlt /= 255.
|
||||||
|
return rlt.astype(in_img_type)
|
||||||
|
|
||||||
|
|
||||||
|
def ycbcr2rgb(img):
|
||||||
|
'''same as matlab ycbcr2rgb
|
||||||
|
Input:
|
||||||
|
uint8, [0, 255]
|
||||||
|
float, [0, 1]
|
||||||
|
'''
|
||||||
|
in_img_type = img.dtype
|
||||||
|
img.astype(np.float32)
|
||||||
|
if in_img_type != np.uint8:
|
||||||
|
img *= 255.
|
||||||
|
# convert
|
||||||
|
rlt = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621], [0, -0.00153632, 0.00791071],
|
||||||
|
[0.00625893, -0.00318811, 0]]) * 255.0 + [-222.921, 135.576, -276.836]
|
||||||
|
if in_img_type == np.uint8:
|
||||||
|
rlt = rlt.round()
|
||||||
|
else:
|
||||||
|
rlt /= 255.
|
||||||
|
return rlt.astype(in_img_type)
|
||||||
|
|
||||||
|
|
||||||
|
def bgr2ycbcr(img, only_y=True):
|
||||||
|
'''bgr version of rgb2ycbcr
|
||||||
|
only_y: only return Y channel
|
||||||
|
Input:
|
||||||
|
uint8, [0, 255]
|
||||||
|
float, [0, 1]
|
||||||
|
'''
|
||||||
|
in_img_type = img.dtype
|
||||||
|
img.astype(np.float32)
|
||||||
|
if in_img_type != np.uint8:
|
||||||
|
img *= 255.
|
||||||
|
# convert
|
||||||
|
if only_y:
|
||||||
|
rlt = np.dot(img, [24.966, 128.553, 65.481]) / 255.0 + 16.0
|
||||||
|
else:
|
||||||
|
rlt = np.matmul(img, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786],
|
||||||
|
[65.481, -37.797, 112.0]]) / 255.0 + [16, 128, 128]
|
||||||
|
if in_img_type == np.uint8:
|
||||||
|
rlt = rlt.round()
|
||||||
|
else:
|
||||||
|
rlt /= 255.
|
||||||
|
return rlt.astype(in_img_type)
|
||||||
|
|
||||||
|
|
||||||
|
def channel_convert(in_c, tar_type, img_list):
|
||||||
|
# conversion among BGR, gray and y
|
||||||
|
if in_c == 3 and tar_type == 'gray': # BGR to gray
|
||||||
|
gray_list = [cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) for img in img_list]
|
||||||
|
return [np.expand_dims(img, axis=2) for img in gray_list]
|
||||||
|
elif in_c == 3 and tar_type == 'y': # BGR to y
|
||||||
|
y_list = [bgr2ycbcr(img, only_y=True) for img in img_list]
|
||||||
|
return [np.expand_dims(img, axis=2) for img in y_list]
|
||||||
|
elif in_c == 1 and tar_type == 'RGB': # gray/y to BGR
|
||||||
|
return [cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) for img in img_list]
|
||||||
|
else:
|
||||||
|
return img_list
|
||||||
|
|
||||||
|
|
||||||
|
'''
|
||||||
|
# --------------------------------------------
|
||||||
|
# metric, PSNR and SSIM
|
||||||
|
# --------------------------------------------
|
||||||
|
'''
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------
|
||||||
|
# PSNR
|
||||||
|
# --------------------------------------------
|
||||||
|
def calculate_psnr(img1, img2, border=0):
|
||||||
|
# img1 and img2 have range [0, 255]
|
||||||
|
#img1 = img1.squeeze()
|
||||||
|
#img2 = img2.squeeze()
|
||||||
|
if not img1.shape == img2.shape:
|
||||||
|
raise ValueError('Input images must have the same dimensions.')
|
||||||
|
h, w = img1.shape[:2]
|
||||||
|
img1 = img1[border:h-border, border:w-border]
|
||||||
|
img2 = img2[border:h-border, border:w-border]
|
||||||
|
|
||||||
|
img1 = img1.astype(np.float64)
|
||||||
|
img2 = img2.astype(np.float64)
|
||||||
|
mse = np.mean((img1 - img2)**2)
|
||||||
|
if mse == 0:
|
||||||
|
return float('inf')
|
||||||
|
return 20 * math.log10(255.0 / math.sqrt(mse))
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------
|
||||||
|
# SSIM
|
||||||
|
# --------------------------------------------
|
||||||
|
def calculate_ssim(img1, img2, border=0):
|
||||||
|
'''calculate SSIM
|
||||||
|
the same outputs as MATLAB's
|
||||||
|
img1, img2: [0, 255]
|
||||||
|
'''
|
||||||
|
#img1 = img1.squeeze()
|
||||||
|
#img2 = img2.squeeze()
|
||||||
|
if not img1.shape == img2.shape:
|
||||||
|
raise ValueError('Input images must have the same dimensions.')
|
||||||
|
h, w = img1.shape[:2]
|
||||||
|
img1 = img1[border:h-border, border:w-border]
|
||||||
|
img2 = img2[border:h-border, border:w-border]
|
||||||
|
|
||||||
|
if img1.ndim == 2:
|
||||||
|
return ssim(img1, img2)
|
||||||
|
elif img1.ndim == 3:
|
||||||
|
if img1.shape[2] == 3:
|
||||||
|
ssims = []
|
||||||
|
for i in range(3):
|
||||||
|
ssims.append(ssim(img1[:,:,i], img2[:,:,i]))
|
||||||
|
return np.array(ssims).mean()
|
||||||
|
elif img1.shape[2] == 1:
|
||||||
|
return ssim(np.squeeze(img1), np.squeeze(img2))
|
||||||
|
else:
|
||||||
|
raise ValueError('Wrong input image dimensions.')
|
||||||
|
|
||||||
|
|
||||||
|
def ssim(img1, img2):
|
||||||
|
C1 = (0.01 * 255)**2
|
||||||
|
C2 = (0.03 * 255)**2
|
||||||
|
|
||||||
|
img1 = img1.astype(np.float64)
|
||||||
|
img2 = img2.astype(np.float64)
|
||||||
|
kernel = cv2.getGaussianKernel(11, 1.5)
|
||||||
|
window = np.outer(kernel, kernel.transpose())
|
||||||
|
|
||||||
|
mu1 = cv2.filter2D(img1, -1, window)[5:-5, 5:-5] # valid
|
||||||
|
mu2 = cv2.filter2D(img2, -1, window)[5:-5, 5:-5]
|
||||||
|
mu1_sq = mu1**2
|
||||||
|
mu2_sq = mu2**2
|
||||||
|
mu1_mu2 = mu1 * mu2
|
||||||
|
sigma1_sq = cv2.filter2D(img1**2, -1, window)[5:-5, 5:-5] - mu1_sq
|
||||||
|
sigma2_sq = cv2.filter2D(img2**2, -1, window)[5:-5, 5:-5] - mu2_sq
|
||||||
|
sigma12 = cv2.filter2D(img1 * img2, -1, window)[5:-5, 5:-5] - mu1_mu2
|
||||||
|
|
||||||
|
ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) *
|
||||||
|
(sigma1_sq + sigma2_sq + C2))
|
||||||
|
return ssim_map.mean()
|
||||||
|
|
||||||
|
|
||||||
|
'''
|
||||||
|
# --------------------------------------------
|
||||||
|
# matlab's bicubic imresize (numpy and torch) [0, 1]
|
||||||
|
# --------------------------------------------
|
||||||
|
'''
|
||||||
|
|
||||||
|
|
||||||
|
# matlab 'imresize' function, now only support 'bicubic'
|
||||||
|
def cubic(x):
|
||||||
|
absx = torch.abs(x)
|
||||||
|
absx2 = absx**2
|
||||||
|
absx3 = absx**3
|
||||||
|
return (1.5*absx3 - 2.5*absx2 + 1) * ((absx <= 1).type_as(absx)) + \
|
||||||
|
(-0.5*absx3 + 2.5*absx2 - 4*absx + 2) * (((absx > 1)*(absx <= 2)).type_as(absx))
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_weights_indices(in_length, out_length, scale, kernel, kernel_width, antialiasing):
|
||||||
|
if (scale < 1) and (antialiasing):
|
||||||
|
# Use a modified kernel to simultaneously interpolate and antialias- larger kernel width
|
||||||
|
kernel_width = kernel_width / scale
|
||||||
|
|
||||||
|
# Output-space coordinates
|
||||||
|
x = torch.linspace(1, out_length, out_length)
|
||||||
|
|
||||||
|
# Input-space coordinates. Calculate the inverse mapping such that 0.5
|
||||||
|
# in output space maps to 0.5 in input space, and 0.5+scale in output
|
||||||
|
# space maps to 1.5 in input space.
|
||||||
|
u = x / scale + 0.5 * (1 - 1 / scale)
|
||||||
|
|
||||||
|
# What is the left-most pixel that can be involved in the computation?
|
||||||
|
left = torch.floor(u - kernel_width / 2)
|
||||||
|
|
||||||
|
# What is the maximum number of pixels that can be involved in the
|
||||||
|
# computation? Note: it's OK to use an extra pixel here; if the
|
||||||
|
# corresponding weights are all zero, it will be eliminated at the end
|
||||||
|
# of this function.
|
||||||
|
P = math.ceil(kernel_width) + 2
|
||||||
|
|
||||||
|
# The indices of the input pixels involved in computing the k-th output
|
||||||
|
# pixel are in row k of the indices matrix.
|
||||||
|
indices = left.view(out_length, 1).expand(out_length, P) + torch.linspace(0, P - 1, P).view(
|
||||||
|
1, P).expand(out_length, P)
|
||||||
|
|
||||||
|
# The weights used to compute the k-th output pixel are in row k of the
|
||||||
|
# weights matrix.
|
||||||
|
distance_to_center = u.view(out_length, 1).expand(out_length, P) - indices
|
||||||
|
# apply cubic kernel
|
||||||
|
if (scale < 1) and (antialiasing):
|
||||||
|
weights = scale * cubic(distance_to_center * scale)
|
||||||
|
else:
|
||||||
|
weights = cubic(distance_to_center)
|
||||||
|
# Normalize the weights matrix so that each row sums to 1.
|
||||||
|
weights_sum = torch.sum(weights, 1).view(out_length, 1)
|
||||||
|
weights = weights / weights_sum.expand(out_length, P)
|
||||||
|
|
||||||
|
# If a column in weights is all zero, get rid of it. only consider the first and last column.
|
||||||
|
weights_zero_tmp = torch.sum((weights == 0), 0)
|
||||||
|
if not math.isclose(weights_zero_tmp[0], 0, rel_tol=1e-6):
|
||||||
|
indices = indices.narrow(1, 1, P - 2)
|
||||||
|
weights = weights.narrow(1, 1, P - 2)
|
||||||
|
if not math.isclose(weights_zero_tmp[-1], 0, rel_tol=1e-6):
|
||||||
|
indices = indices.narrow(1, 0, P - 2)
|
||||||
|
weights = weights.narrow(1, 0, P - 2)
|
||||||
|
weights = weights.contiguous()
|
||||||
|
indices = indices.contiguous()
|
||||||
|
sym_len_s = -indices.min() + 1
|
||||||
|
sym_len_e = indices.max() - in_length
|
||||||
|
indices = indices + sym_len_s - 1
|
||||||
|
return weights, indices, int(sym_len_s), int(sym_len_e)
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------
|
||||||
|
# imresize for tensor image [0, 1]
|
||||||
|
# --------------------------------------------
|
||||||
|
def imresize(img, scale, antialiasing=True):
|
||||||
|
# Now the scale should be the same for H and W
|
||||||
|
# input: img: pytorch tensor, CHW or HW [0,1]
|
||||||
|
# output: CHW or HW [0,1] w/o round
|
||||||
|
need_squeeze = True if img.dim() == 2 else False
|
||||||
|
if need_squeeze:
|
||||||
|
img.unsqueeze_(0)
|
||||||
|
in_C, in_H, in_W = img.size()
|
||||||
|
out_C, out_H, out_W = in_C, math.ceil(in_H * scale), math.ceil(in_W * scale)
|
||||||
|
kernel_width = 4
|
||||||
|
kernel = 'cubic'
|
||||||
|
|
||||||
|
# Return the desired dimension order for performing the resize. The
|
||||||
|
# strategy is to perform the resize first along the dimension with the
|
||||||
|
# smallest scale factor.
|
||||||
|
# Now we do not support this.
|
||||||
|
|
||||||
|
# get weights and indices
|
||||||
|
weights_H, indices_H, sym_len_Hs, sym_len_He = calculate_weights_indices(
|
||||||
|
in_H, out_H, scale, kernel, kernel_width, antialiasing)
|
||||||
|
weights_W, indices_W, sym_len_Ws, sym_len_We = calculate_weights_indices(
|
||||||
|
in_W, out_W, scale, kernel, kernel_width, antialiasing)
|
||||||
|
# process H dimension
|
||||||
|
# symmetric copying
|
||||||
|
img_aug = torch.FloatTensor(in_C, in_H + sym_len_Hs + sym_len_He, in_W)
|
||||||
|
img_aug.narrow(1, sym_len_Hs, in_H).copy_(img)
|
||||||
|
|
||||||
|
sym_patch = img[:, :sym_len_Hs, :]
|
||||||
|
inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long()
|
||||||
|
sym_patch_inv = sym_patch.index_select(1, inv_idx)
|
||||||
|
img_aug.narrow(1, 0, sym_len_Hs).copy_(sym_patch_inv)
|
||||||
|
|
||||||
|
sym_patch = img[:, -sym_len_He:, :]
|
||||||
|
inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long()
|
||||||
|
sym_patch_inv = sym_patch.index_select(1, inv_idx)
|
||||||
|
img_aug.narrow(1, sym_len_Hs + in_H, sym_len_He).copy_(sym_patch_inv)
|
||||||
|
|
||||||
|
out_1 = torch.FloatTensor(in_C, out_H, in_W)
|
||||||
|
kernel_width = weights_H.size(1)
|
||||||
|
for i in range(out_H):
|
||||||
|
idx = int(indices_H[i][0])
|
||||||
|
for j in range(out_C):
|
||||||
|
out_1[j, i, :] = img_aug[j, idx:idx + kernel_width, :].transpose(0, 1).mv(weights_H[i])
|
||||||
|
|
||||||
|
# process W dimension
|
||||||
|
# symmetric copying
|
||||||
|
out_1_aug = torch.FloatTensor(in_C, out_H, in_W + sym_len_Ws + sym_len_We)
|
||||||
|
out_1_aug.narrow(2, sym_len_Ws, in_W).copy_(out_1)
|
||||||
|
|
||||||
|
sym_patch = out_1[:, :, :sym_len_Ws]
|
||||||
|
inv_idx = torch.arange(sym_patch.size(2) - 1, -1, -1).long()
|
||||||
|
sym_patch_inv = sym_patch.index_select(2, inv_idx)
|
||||||
|
out_1_aug.narrow(2, 0, sym_len_Ws).copy_(sym_patch_inv)
|
||||||
|
|
||||||
|
sym_patch = out_1[:, :, -sym_len_We:]
|
||||||
|
inv_idx = torch.arange(sym_patch.size(2) - 1, -1, -1).long()
|
||||||
|
sym_patch_inv = sym_patch.index_select(2, inv_idx)
|
||||||
|
out_1_aug.narrow(2, sym_len_Ws + in_W, sym_len_We).copy_(sym_patch_inv)
|
||||||
|
|
||||||
|
out_2 = torch.FloatTensor(in_C, out_H, out_W)
|
||||||
|
kernel_width = weights_W.size(1)
|
||||||
|
for i in range(out_W):
|
||||||
|
idx = int(indices_W[i][0])
|
||||||
|
for j in range(out_C):
|
||||||
|
out_2[j, :, i] = out_1_aug[j, :, idx:idx + kernel_width].mv(weights_W[i])
|
||||||
|
if need_squeeze:
|
||||||
|
out_2.squeeze_()
|
||||||
|
return out_2
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------
|
||||||
|
# imresize for numpy image [0, 1]
|
||||||
|
# --------------------------------------------
|
||||||
|
def imresize_np(img, scale, antialiasing=True):
|
||||||
|
# Now the scale should be the same for H and W
|
||||||
|
# input: img: Numpy, HWC or HW [0,1]
|
||||||
|
# output: HWC or HW [0,1] w/o round
|
||||||
|
img = torch.from_numpy(img)
|
||||||
|
need_squeeze = True if img.dim() == 2 else False
|
||||||
|
if need_squeeze:
|
||||||
|
img.unsqueeze_(2)
|
||||||
|
|
||||||
|
in_H, in_W, in_C = img.size()
|
||||||
|
out_C, out_H, out_W = in_C, math.ceil(in_H * scale), math.ceil(in_W * scale)
|
||||||
|
kernel_width = 4
|
||||||
|
kernel = 'cubic'
|
||||||
|
|
||||||
|
# Return the desired dimension order for performing the resize. The
|
||||||
|
# strategy is to perform the resize first along the dimension with the
|
||||||
|
# smallest scale factor.
|
||||||
|
# Now we do not support this.
|
||||||
|
|
||||||
|
# get weights and indices
|
||||||
|
weights_H, indices_H, sym_len_Hs, sym_len_He = calculate_weights_indices(
|
||||||
|
in_H, out_H, scale, kernel, kernel_width, antialiasing)
|
||||||
|
weights_W, indices_W, sym_len_Ws, sym_len_We = calculate_weights_indices(
|
||||||
|
in_W, out_W, scale, kernel, kernel_width, antialiasing)
|
||||||
|
# process H dimension
|
||||||
|
# symmetric copying
|
||||||
|
img_aug = torch.FloatTensor(in_H + sym_len_Hs + sym_len_He, in_W, in_C)
|
||||||
|
img_aug.narrow(0, sym_len_Hs, in_H).copy_(img)
|
||||||
|
|
||||||
|
sym_patch = img[:sym_len_Hs, :, :]
|
||||||
|
inv_idx = torch.arange(sym_patch.size(0) - 1, -1, -1).long()
|
||||||
|
sym_patch_inv = sym_patch.index_select(0, inv_idx)
|
||||||
|
img_aug.narrow(0, 0, sym_len_Hs).copy_(sym_patch_inv)
|
||||||
|
|
||||||
|
sym_patch = img[-sym_len_He:, :, :]
|
||||||
|
inv_idx = torch.arange(sym_patch.size(0) - 1, -1, -1).long()
|
||||||
|
sym_patch_inv = sym_patch.index_select(0, inv_idx)
|
||||||
|
img_aug.narrow(0, sym_len_Hs + in_H, sym_len_He).copy_(sym_patch_inv)
|
||||||
|
|
||||||
|
out_1 = torch.FloatTensor(out_H, in_W, in_C)
|
||||||
|
kernel_width = weights_H.size(1)
|
||||||
|
for i in range(out_H):
|
||||||
|
idx = int(indices_H[i][0])
|
||||||
|
for j in range(out_C):
|
||||||
|
out_1[i, :, j] = img_aug[idx:idx + kernel_width, :, j].transpose(0, 1).mv(weights_H[i])
|
||||||
|
|
||||||
|
# process W dimension
|
||||||
|
# symmetric copying
|
||||||
|
out_1_aug = torch.FloatTensor(out_H, in_W + sym_len_Ws + sym_len_We, in_C)
|
||||||
|
out_1_aug.narrow(1, sym_len_Ws, in_W).copy_(out_1)
|
||||||
|
|
||||||
|
sym_patch = out_1[:, :sym_len_Ws, :]
|
||||||
|
inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long()
|
||||||
|
sym_patch_inv = sym_patch.index_select(1, inv_idx)
|
||||||
|
out_1_aug.narrow(1, 0, sym_len_Ws).copy_(sym_patch_inv)
|
||||||
|
|
||||||
|
sym_patch = out_1[:, -sym_len_We:, :]
|
||||||
|
inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long()
|
||||||
|
sym_patch_inv = sym_patch.index_select(1, inv_idx)
|
||||||
|
out_1_aug.narrow(1, sym_len_Ws + in_W, sym_len_We).copy_(sym_patch_inv)
|
||||||
|
|
||||||
|
out_2 = torch.FloatTensor(out_H, out_W, in_C)
|
||||||
|
kernel_width = weights_W.size(1)
|
||||||
|
for i in range(out_W):
|
||||||
|
idx = int(indices_W[i][0])
|
||||||
|
for j in range(out_C):
|
||||||
|
out_2[:, i, j] = out_1_aug[:, idx:idx + kernel_width, j].mv(weights_W[i])
|
||||||
|
if need_squeeze:
|
||||||
|
out_2.squeeze_()
|
||||||
|
|
||||||
|
return out_2.numpy()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
print('---')
|
||||||
|
# img = imread_uint('test.bmp', 3)
|
||||||
|
# img = uint2single(img)
|
||||||
|
# img_bicubic = imresize_np(img, 1/4)
|
170
AnyText/AnyText_scripts/ldm/modules/midas/api.py
Normal file
@ -0,0 +1,170 @@
|
|||||||
|
# based on https://github.com/isl-org/MiDaS
|
||||||
|
|
||||||
|
import cv2
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from torchvision.transforms import Compose
|
||||||
|
|
||||||
|
from .midas.dpt_depth import DPTDepthModel
|
||||||
|
from .midas.midas_net import MidasNet
|
||||||
|
from .midas.midas_net_custom import MidasNet_small
|
||||||
|
from .midas.transforms import Resize, NormalizeImage, PrepareForNet
|
||||||
|
|
||||||
|
|
||||||
|
ISL_PATHS = {
|
||||||
|
"dpt_large": "midas_models/dpt_large-midas-2f21e586.pt",
|
||||||
|
"dpt_hybrid": "midas_models/dpt_hybrid-midas-501f0c75.pt",
|
||||||
|
"midas_v21": "",
|
||||||
|
"midas_v21_small": "",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def disabled_train(self, mode=True):
|
||||||
|
"""Overwrite model.train with this function to make sure train/eval mode
|
||||||
|
does not change anymore."""
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
||||||
|
def load_midas_transform(model_type):
|
||||||
|
# https://github.com/isl-org/MiDaS/blob/master/run.py
|
||||||
|
# load transform only
|
||||||
|
if model_type == "dpt_large": # DPT-Large
|
||||||
|
net_w, net_h = 384, 384
|
||||||
|
resize_mode = "minimal"
|
||||||
|
normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
|
||||||
|
|
||||||
|
elif model_type == "dpt_hybrid": # DPT-Hybrid
|
||||||
|
net_w, net_h = 384, 384
|
||||||
|
resize_mode = "minimal"
|
||||||
|
normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
|
||||||
|
|
||||||
|
elif model_type == "midas_v21":
|
||||||
|
net_w, net_h = 384, 384
|
||||||
|
resize_mode = "upper_bound"
|
||||||
|
normalization = NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
|
||||||
|
|
||||||
|
elif model_type == "midas_v21_small":
|
||||||
|
net_w, net_h = 256, 256
|
||||||
|
resize_mode = "upper_bound"
|
||||||
|
normalization = NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
|
||||||
|
|
||||||
|
else:
|
||||||
|
assert False, f"model_type '{model_type}' not implemented, use: --model_type large"
|
||||||
|
|
||||||
|
transform = Compose(
|
||||||
|
[
|
||||||
|
Resize(
|
||||||
|
net_w,
|
||||||
|
net_h,
|
||||||
|
resize_target=None,
|
||||||
|
keep_aspect_ratio=True,
|
||||||
|
ensure_multiple_of=32,
|
||||||
|
resize_method=resize_mode,
|
||||||
|
image_interpolation_method=cv2.INTER_CUBIC,
|
||||||
|
),
|
||||||
|
normalization,
|
||||||
|
PrepareForNet(),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
return transform
|
||||||
|
|
||||||
|
|
||||||
|
def load_model(model_type):
|
||||||
|
# https://github.com/isl-org/MiDaS/blob/master/run.py
|
||||||
|
# load network
|
||||||
|
model_path = ISL_PATHS[model_type]
|
||||||
|
if model_type == "dpt_large": # DPT-Large
|
||||||
|
model = DPTDepthModel(
|
||||||
|
path=model_path,
|
||||||
|
backbone="vitl16_384",
|
||||||
|
non_negative=True,
|
||||||
|
)
|
||||||
|
net_w, net_h = 384, 384
|
||||||
|
resize_mode = "minimal"
|
||||||
|
normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
|
||||||
|
|
||||||
|
elif model_type == "dpt_hybrid": # DPT-Hybrid
|
||||||
|
model = DPTDepthModel(
|
||||||
|
path=model_path,
|
||||||
|
backbone="vitb_rn50_384",
|
||||||
|
non_negative=True,
|
||||||
|
)
|
||||||
|
net_w, net_h = 384, 384
|
||||||
|
resize_mode = "minimal"
|
||||||
|
normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
|
||||||
|
|
||||||
|
elif model_type == "midas_v21":
|
||||||
|
model = MidasNet(model_path, non_negative=True)
|
||||||
|
net_w, net_h = 384, 384
|
||||||
|
resize_mode = "upper_bound"
|
||||||
|
normalization = NormalizeImage(
|
||||||
|
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
|
||||||
|
)
|
||||||
|
|
||||||
|
elif model_type == "midas_v21_small":
|
||||||
|
model = MidasNet_small(model_path, features=64, backbone="efficientnet_lite3", exportable=True,
|
||||||
|
non_negative=True, blocks={'expand': True})
|
||||||
|
net_w, net_h = 256, 256
|
||||||
|
resize_mode = "upper_bound"
|
||||||
|
normalization = NormalizeImage(
|
||||||
|
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
|
||||||
|
)
|
||||||
|
|
||||||
|
else:
|
||||||
|
print(f"model_type '{model_type}' not implemented, use: --model_type large")
|
||||||
|
assert False
|
||||||
|
|
||||||
|
transform = Compose(
|
||||||
|
[
|
||||||
|
Resize(
|
||||||
|
net_w,
|
||||||
|
net_h,
|
||||||
|
resize_target=None,
|
||||||
|
keep_aspect_ratio=True,
|
||||||
|
ensure_multiple_of=32,
|
||||||
|
resize_method=resize_mode,
|
||||||
|
image_interpolation_method=cv2.INTER_CUBIC,
|
||||||
|
),
|
||||||
|
normalization,
|
||||||
|
PrepareForNet(),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
return model.eval(), transform
|
||||||
|
|
||||||
|
|
||||||
|
class MiDaSInference(nn.Module):
|
||||||
|
MODEL_TYPES_TORCH_HUB = [
|
||||||
|
"DPT_Large",
|
||||||
|
"DPT_Hybrid",
|
||||||
|
"MiDaS_small"
|
||||||
|
]
|
||||||
|
MODEL_TYPES_ISL = [
|
||||||
|
"dpt_large",
|
||||||
|
"dpt_hybrid",
|
||||||
|
"midas_v21",
|
||||||
|
"midas_v21_small",
|
||||||
|
]
|
||||||
|
|
||||||
|
def __init__(self, model_type):
|
||||||
|
super().__init__()
|
||||||
|
assert (model_type in self.MODEL_TYPES_ISL)
|
||||||
|
model, _ = load_model(model_type)
|
||||||
|
self.model = model
|
||||||
|
self.model.train = disabled_train
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
# x in 0..1 as produced by calling self.transform on a 0..1 float64 numpy array
|
||||||
|
# NOTE: we expect that the correct transform has been called during dataloading.
|
||||||
|
with torch.no_grad():
|
||||||
|
prediction = self.model(x)
|
||||||
|
prediction = torch.nn.functional.interpolate(
|
||||||
|
prediction.unsqueeze(1),
|
||||||
|
size=x.shape[2:],
|
||||||
|
mode="bicubic",
|
||||||
|
align_corners=False,
|
||||||
|
)
|
||||||
|
assert prediction.shape == (x.shape[0], 1, x.shape[2], x.shape[3])
|
||||||
|
return prediction
|
||||||
|
|
@ -0,0 +1,16 @@
|
|||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
class BaseModel(torch.nn.Module):
|
||||||
|
def load(self, path):
|
||||||
|
"""Load model from file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path (str): file path
|
||||||
|
"""
|
||||||
|
parameters = torch.load(path, map_location=torch.device('cpu'))
|
||||||
|
|
||||||
|
if "optimizer" in parameters:
|
||||||
|
parameters = parameters["model"]
|
||||||
|
|
||||||
|
self.load_state_dict(parameters)
|
342
AnyText/AnyText_scripts/ldm/modules/midas/midas/blocks.py
Normal file
@ -0,0 +1,342 @@
|
|||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
|
||||||
|
from .vit import (
|
||||||
|
_make_pretrained_vitb_rn50_384,
|
||||||
|
_make_pretrained_vitl16_384,
|
||||||
|
_make_pretrained_vitb16_384,
|
||||||
|
forward_vit,
|
||||||
|
)
|
||||||
|
|
||||||
|
def _make_encoder(backbone, features, use_pretrained, groups=1, expand=False, exportable=True, hooks=None, use_vit_only=False, use_readout="ignore",):
|
||||||
|
if backbone == "vitl16_384":
|
||||||
|
pretrained = _make_pretrained_vitl16_384(
|
||||||
|
use_pretrained, hooks=hooks, use_readout=use_readout
|
||||||
|
)
|
||||||
|
scratch = _make_scratch(
|
||||||
|
[256, 512, 1024, 1024], features, groups=groups, expand=expand
|
||||||
|
) # ViT-L/16 - 85.0% Top1 (backbone)
|
||||||
|
elif backbone == "vitb_rn50_384":
|
||||||
|
pretrained = _make_pretrained_vitb_rn50_384(
|
||||||
|
use_pretrained,
|
||||||
|
hooks=hooks,
|
||||||
|
use_vit_only=use_vit_only,
|
||||||
|
use_readout=use_readout,
|
||||||
|
)
|
||||||
|
scratch = _make_scratch(
|
||||||
|
[256, 512, 768, 768], features, groups=groups, expand=expand
|
||||||
|
) # ViT-H/16 - 85.0% Top1 (backbone)
|
||||||
|
elif backbone == "vitb16_384":
|
||||||
|
pretrained = _make_pretrained_vitb16_384(
|
||||||
|
use_pretrained, hooks=hooks, use_readout=use_readout
|
||||||
|
)
|
||||||
|
scratch = _make_scratch(
|
||||||
|
[96, 192, 384, 768], features, groups=groups, expand=expand
|
||||||
|
) # ViT-B/16 - 84.6% Top1 (backbone)
|
||||||
|
elif backbone == "resnext101_wsl":
|
||||||
|
pretrained = _make_pretrained_resnext101_wsl(use_pretrained)
|
||||||
|
scratch = _make_scratch([256, 512, 1024, 2048], features, groups=groups, expand=expand) # efficientnet_lite3
|
||||||
|
elif backbone == "efficientnet_lite3":
|
||||||
|
pretrained = _make_pretrained_efficientnet_lite3(use_pretrained, exportable=exportable)
|
||||||
|
scratch = _make_scratch([32, 48, 136, 384], features, groups=groups, expand=expand) # efficientnet_lite3
|
||||||
|
else:
|
||||||
|
print(f"Backbone '{backbone}' not implemented")
|
||||||
|
assert False
|
||||||
|
|
||||||
|
return pretrained, scratch
|
||||||
|
|
||||||
|
|
||||||
|
def _make_scratch(in_shape, out_shape, groups=1, expand=False):
|
||||||
|
scratch = nn.Module()
|
||||||
|
|
||||||
|
out_shape1 = out_shape
|
||||||
|
out_shape2 = out_shape
|
||||||
|
out_shape3 = out_shape
|
||||||
|
out_shape4 = out_shape
|
||||||
|
if expand==True:
|
||||||
|
out_shape1 = out_shape
|
||||||
|
out_shape2 = out_shape*2
|
||||||
|
out_shape3 = out_shape*4
|
||||||
|
out_shape4 = out_shape*8
|
||||||
|
|
||||||
|
scratch.layer1_rn = nn.Conv2d(
|
||||||
|
in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
|
||||||
|
)
|
||||||
|
scratch.layer2_rn = nn.Conv2d(
|
||||||
|
in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
|
||||||
|
)
|
||||||
|
scratch.layer3_rn = nn.Conv2d(
|
||||||
|
in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
|
||||||
|
)
|
||||||
|
scratch.layer4_rn = nn.Conv2d(
|
||||||
|
in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
|
||||||
|
)
|
||||||
|
|
||||||
|
return scratch
|
||||||
|
|
||||||
|
|
||||||
|
def _make_pretrained_efficientnet_lite3(use_pretrained, exportable=False):
|
||||||
|
efficientnet = torch.hub.load(
|
||||||
|
"rwightman/gen-efficientnet-pytorch",
|
||||||
|
"tf_efficientnet_lite3",
|
||||||
|
pretrained=use_pretrained,
|
||||||
|
exportable=exportable
|
||||||
|
)
|
||||||
|
return _make_efficientnet_backbone(efficientnet)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_efficientnet_backbone(effnet):
|
||||||
|
pretrained = nn.Module()
|
||||||
|
|
||||||
|
pretrained.layer1 = nn.Sequential(
|
||||||
|
effnet.conv_stem, effnet.bn1, effnet.act1, *effnet.blocks[0:2]
|
||||||
|
)
|
||||||
|
pretrained.layer2 = nn.Sequential(*effnet.blocks[2:3])
|
||||||
|
pretrained.layer3 = nn.Sequential(*effnet.blocks[3:5])
|
||||||
|
pretrained.layer4 = nn.Sequential(*effnet.blocks[5:9])
|
||||||
|
|
||||||
|
return pretrained
|
||||||
|
|
||||||
|
|
||||||
|
def _make_resnet_backbone(resnet):
|
||||||
|
pretrained = nn.Module()
|
||||||
|
pretrained.layer1 = nn.Sequential(
|
||||||
|
resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool, resnet.layer1
|
||||||
|
)
|
||||||
|
|
||||||
|
pretrained.layer2 = resnet.layer2
|
||||||
|
pretrained.layer3 = resnet.layer3
|
||||||
|
pretrained.layer4 = resnet.layer4
|
||||||
|
|
||||||
|
return pretrained
|
||||||
|
|
||||||
|
|
||||||
|
def _make_pretrained_resnext101_wsl(use_pretrained):
|
||||||
|
resnet = torch.hub.load("facebookresearch/WSL-Images", "resnext101_32x8d_wsl")
|
||||||
|
return _make_resnet_backbone(resnet)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class Interpolate(nn.Module):
|
||||||
|
"""Interpolation module.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, scale_factor, mode, align_corners=False):
|
||||||
|
"""Init.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
scale_factor (float): scaling
|
||||||
|
mode (str): interpolation mode
|
||||||
|
"""
|
||||||
|
super(Interpolate, self).__init__()
|
||||||
|
|
||||||
|
self.interp = nn.functional.interpolate
|
||||||
|
self.scale_factor = scale_factor
|
||||||
|
self.mode = mode
|
||||||
|
self.align_corners = align_corners
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
"""Forward pass.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
x (tensor): input
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
tensor: interpolated data
|
||||||
|
"""
|
||||||
|
|
||||||
|
x = self.interp(
|
||||||
|
x, scale_factor=self.scale_factor, mode=self.mode, align_corners=self.align_corners
|
||||||
|
)
|
||||||
|
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
class ResidualConvUnit(nn.Module):
|
||||||
|
"""Residual convolution module.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, features):
|
||||||
|
"""Init.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
features (int): number of features
|
||||||
|
"""
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
self.conv1 = nn.Conv2d(
|
||||||
|
features, features, kernel_size=3, stride=1, padding=1, bias=True
|
||||||
|
)
|
||||||
|
|
||||||
|
self.conv2 = nn.Conv2d(
|
||||||
|
features, features, kernel_size=3, stride=1, padding=1, bias=True
|
||||||
|
)
|
||||||
|
|
||||||
|
self.relu = nn.ReLU(inplace=True)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
"""Forward pass.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
x (tensor): input
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
tensor: output
|
||||||
|
"""
|
||||||
|
out = self.relu(x)
|
||||||
|
out = self.conv1(out)
|
||||||
|
out = self.relu(out)
|
||||||
|
out = self.conv2(out)
|
||||||
|
|
||||||
|
return out + x
|
||||||
|
|
||||||
|
|
||||||
|
class FeatureFusionBlock(nn.Module):
|
||||||
|
"""Feature fusion block.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, features):
|
||||||
|
"""Init.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
features (int): number of features
|
||||||
|
"""
|
||||||
|
super(FeatureFusionBlock, self).__init__()
|
||||||
|
|
||||||
|
self.resConfUnit1 = ResidualConvUnit(features)
|
||||||
|
self.resConfUnit2 = ResidualConvUnit(features)
|
||||||
|
|
||||||
|
def forward(self, *xs):
|
||||||
|
"""Forward pass.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
tensor: output
|
||||||
|
"""
|
||||||
|
output = xs[0]
|
||||||
|
|
||||||
|
if len(xs) == 2:
|
||||||
|
output += self.resConfUnit1(xs[1])
|
||||||
|
|
||||||
|
output = self.resConfUnit2(output)
|
||||||
|
|
||||||
|
output = nn.functional.interpolate(
|
||||||
|
output, scale_factor=2, mode="bilinear", align_corners=True
|
||||||
|
)
|
||||||
|
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class ResidualConvUnit_custom(nn.Module):
|
||||||
|
"""Residual convolution module.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, features, activation, bn):
|
||||||
|
"""Init.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
features (int): number of features
|
||||||
|
"""
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
self.bn = bn
|
||||||
|
|
||||||
|
self.groups=1
|
||||||
|
|
||||||
|
self.conv1 = nn.Conv2d(
|
||||||
|
features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
|
||||||
|
)
|
||||||
|
|
||||||
|
self.conv2 = nn.Conv2d(
|
||||||
|
features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
|
||||||
|
)
|
||||||
|
|
||||||
|
if self.bn==True:
|
||||||
|
self.bn1 = nn.BatchNorm2d(features)
|
||||||
|
self.bn2 = nn.BatchNorm2d(features)
|
||||||
|
|
||||||
|
self.activation = activation
|
||||||
|
|
||||||
|
self.skip_add = nn.quantized.FloatFunctional()
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
"""Forward pass.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
x (tensor): input
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
tensor: output
|
||||||
|
"""
|
||||||
|
|
||||||
|
out = self.activation(x)
|
||||||
|
out = self.conv1(out)
|
||||||
|
if self.bn==True:
|
||||||
|
out = self.bn1(out)
|
||||||
|
|
||||||
|
out = self.activation(out)
|
||||||
|
out = self.conv2(out)
|
||||||
|
if self.bn==True:
|
||||||
|
out = self.bn2(out)
|
||||||
|
|
||||||
|
if self.groups > 1:
|
||||||
|
out = self.conv_merge(out)
|
||||||
|
|
||||||
|
return self.skip_add.add(out, x)
|
||||||
|
|
||||||
|
# return out + x
|
||||||
|
|
||||||
|
|
||||||
|
class FeatureFusionBlock_custom(nn.Module):
|
||||||
|
"""Feature fusion block.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, features, activation, deconv=False, bn=False, expand=False, align_corners=True):
|
||||||
|
"""Init.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
features (int): number of features
|
||||||
|
"""
|
||||||
|
super(FeatureFusionBlock_custom, self).__init__()
|
||||||
|
|
||||||
|
self.deconv = deconv
|
||||||
|
self.align_corners = align_corners
|
||||||
|
|
||||||
|
self.groups=1
|
||||||
|
|
||||||
|
self.expand = expand
|
||||||
|
out_features = features
|
||||||
|
if self.expand==True:
|
||||||
|
out_features = features//2
|
||||||
|
|
||||||
|
self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
|
||||||
|
|
||||||
|
self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
|
||||||
|
self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)
|
||||||
|
|
||||||
|
self.skip_add = nn.quantized.FloatFunctional()
|
||||||
|
|
||||||
|
def forward(self, *xs):
|
||||||
|
"""Forward pass.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
tensor: output
|
||||||
|
"""
|
||||||
|
output = xs[0]
|
||||||
|
|
||||||
|
if len(xs) == 2:
|
||||||
|
res = self.resConfUnit1(xs[1])
|
||||||
|
output = self.skip_add.add(output, res)
|
||||||
|
# output += res
|
||||||
|
|
||||||
|
output = self.resConfUnit2(output)
|
||||||
|
|
||||||
|
output = nn.functional.interpolate(
|
||||||
|
output, scale_factor=2, mode="bilinear", align_corners=self.align_corners
|
||||||
|
)
|
||||||
|
|
||||||
|
output = self.out_conv(output)
|
||||||
|
|
||||||
|
return output
|
||||||
|
|
109
AnyText/AnyText_scripts/ldm/modules/midas/midas/dpt_depth.py
Normal file
@ -0,0 +1,109 @@
|
|||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.nn.functional as F
|
||||||
|
|
||||||
|
from .base_model import BaseModel
|
||||||
|
from .blocks import (
|
||||||
|
FeatureFusionBlock,
|
||||||
|
FeatureFusionBlock_custom,
|
||||||
|
Interpolate,
|
||||||
|
_make_encoder,
|
||||||
|
forward_vit,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_fusion_block(features, use_bn):
|
||||||
|
return FeatureFusionBlock_custom(
|
||||||
|
features,
|
||||||
|
nn.ReLU(False),
|
||||||
|
deconv=False,
|
||||||
|
bn=use_bn,
|
||||||
|
expand=False,
|
||||||
|
align_corners=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class DPT(BaseModel):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
head,
|
||||||
|
features=256,
|
||||||
|
backbone="vitb_rn50_384",
|
||||||
|
readout="project",
|
||||||
|
channels_last=False,
|
||||||
|
use_bn=False,
|
||||||
|
):
|
||||||
|
|
||||||
|
super(DPT, self).__init__()
|
||||||
|
|
||||||
|
self.channels_last = channels_last
|
||||||
|
|
||||||
|
hooks = {
|
||||||
|
"vitb_rn50_384": [0, 1, 8, 11],
|
||||||
|
"vitb16_384": [2, 5, 8, 11],
|
||||||
|
"vitl16_384": [5, 11, 17, 23],
|
||||||
|
}
|
||||||
|
|
||||||
|
# Instantiate backbone and reassemble blocks
|
||||||
|
self.pretrained, self.scratch = _make_encoder(
|
||||||
|
backbone,
|
||||||
|
features,
|
||||||
|
False, # Set to true of you want to train from scratch, uses ImageNet weights
|
||||||
|
groups=1,
|
||||||
|
expand=False,
|
||||||
|
exportable=False,
|
||||||
|
hooks=hooks[backbone],
|
||||||
|
use_readout=readout,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
|
||||||
|
self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
|
||||||
|
self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
|
||||||
|
self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
|
||||||
|
|
||||||
|
self.scratch.output_conv = head
|
||||||
|
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
if self.channels_last == True:
|
||||||
|
x.contiguous(memory_format=torch.channels_last)
|
||||||
|
|
||||||
|
layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x)
|
||||||
|
|
||||||
|
layer_1_rn = self.scratch.layer1_rn(layer_1)
|
||||||
|
layer_2_rn = self.scratch.layer2_rn(layer_2)
|
||||||
|
layer_3_rn = self.scratch.layer3_rn(layer_3)
|
||||||
|
layer_4_rn = self.scratch.layer4_rn(layer_4)
|
||||||
|
|
||||||
|
path_4 = self.scratch.refinenet4(layer_4_rn)
|
||||||
|
path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
|
||||||
|
path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
|
||||||
|
path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
|
||||||
|
|
||||||
|
out = self.scratch.output_conv(path_1)
|
||||||
|
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
class DPTDepthModel(DPT):
|
||||||
|
def __init__(self, path=None, non_negative=True, **kwargs):
|
||||||
|
features = kwargs["features"] if "features" in kwargs else 256
|
||||||
|
|
||||||
|
head = nn.Sequential(
|
||||||
|
nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1),
|
||||||
|
Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
|
||||||
|
nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
|
||||||
|
nn.ReLU(True),
|
||||||
|
nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
|
||||||
|
nn.ReLU(True) if non_negative else nn.Identity(),
|
||||||
|
nn.Identity(),
|
||||||
|
)
|
||||||
|
|
||||||
|
super().__init__(head, **kwargs)
|
||||||
|
|
||||||
|
if path is not None:
|
||||||
|
self.load(path)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
return super().forward(x).squeeze(dim=1)
|
||||||
|
|
76
AnyText/AnyText_scripts/ldm/modules/midas/midas/midas_net.py
Normal file
@ -0,0 +1,76 @@
|
|||||||
|
"""MidashNet: Network for monocular depth estimation trained by mixing several datasets.
|
||||||
|
This file contains code that is adapted from
|
||||||
|
https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py
|
||||||
|
"""
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
|
||||||
|
from .base_model import BaseModel
|
||||||
|
from .blocks import FeatureFusionBlock, Interpolate, _make_encoder
|
||||||
|
|
||||||
|
|
||||||
|
class MidasNet(BaseModel):
|
||||||
|
"""Network for monocular depth estimation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, path=None, features=256, non_negative=True):
|
||||||
|
"""Init.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path (str, optional): Path to saved model. Defaults to None.
|
||||||
|
features (int, optional): Number of features. Defaults to 256.
|
||||||
|
backbone (str, optional): Backbone network for encoder. Defaults to resnet50
|
||||||
|
"""
|
||||||
|
print("Loading weights: ", path)
|
||||||
|
|
||||||
|
super(MidasNet, self).__init__()
|
||||||
|
|
||||||
|
use_pretrained = False if path is None else True
|
||||||
|
|
||||||
|
self.pretrained, self.scratch = _make_encoder(backbone="resnext101_wsl", features=features, use_pretrained=use_pretrained)
|
||||||
|
|
||||||
|
self.scratch.refinenet4 = FeatureFusionBlock(features)
|
||||||
|
self.scratch.refinenet3 = FeatureFusionBlock(features)
|
||||||
|
self.scratch.refinenet2 = FeatureFusionBlock(features)
|
||||||
|
self.scratch.refinenet1 = FeatureFusionBlock(features)
|
||||||
|
|
||||||
|
self.scratch.output_conv = nn.Sequential(
|
||||||
|
nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1),
|
||||||
|
Interpolate(scale_factor=2, mode="bilinear"),
|
||||||
|
nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1),
|
||||||
|
nn.ReLU(True),
|
||||||
|
nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
|
||||||
|
nn.ReLU(True) if non_negative else nn.Identity(),
|
||||||
|
)
|
||||||
|
|
||||||
|
if path:
|
||||||
|
self.load(path)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
"""Forward pass.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
x (tensor): input data (image)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
tensor: depth
|
||||||
|
"""
|
||||||
|
|
||||||
|
layer_1 = self.pretrained.layer1(x)
|
||||||
|
layer_2 = self.pretrained.layer2(layer_1)
|
||||||
|
layer_3 = self.pretrained.layer3(layer_2)
|
||||||
|
layer_4 = self.pretrained.layer4(layer_3)
|
||||||
|
|
||||||
|
layer_1_rn = self.scratch.layer1_rn(layer_1)
|
||||||
|
layer_2_rn = self.scratch.layer2_rn(layer_2)
|
||||||
|
layer_3_rn = self.scratch.layer3_rn(layer_3)
|
||||||
|
layer_4_rn = self.scratch.layer4_rn(layer_4)
|
||||||
|
|
||||||
|
path_4 = self.scratch.refinenet4(layer_4_rn)
|
||||||
|
path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
|
||||||
|
path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
|
||||||
|
path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
|
||||||
|
|
||||||
|
out = self.scratch.output_conv(path_1)
|
||||||
|
|
||||||
|
return torch.squeeze(out, dim=1)
|
@ -0,0 +1,128 @@
|
|||||||
|
"""MidashNet: Network for monocular depth estimation trained by mixing several datasets.
|
||||||
|
This file contains code that is adapted from
|
||||||
|
https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py
|
||||||
|
"""
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
|
||||||
|
from .base_model import BaseModel
|
||||||
|
from .blocks import FeatureFusionBlock, FeatureFusionBlock_custom, Interpolate, _make_encoder
|
||||||
|
|
||||||
|
|
||||||
|
class MidasNet_small(BaseModel):
|
||||||
|
"""Network for monocular depth estimation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, path=None, features=64, backbone="efficientnet_lite3", non_negative=True, exportable=True, channels_last=False, align_corners=True,
|
||||||
|
blocks={'expand': True}):
|
||||||
|
"""Init.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path (str, optional): Path to saved model. Defaults to None.
|
||||||
|
features (int, optional): Number of features. Defaults to 256.
|
||||||
|
backbone (str, optional): Backbone network for encoder. Defaults to resnet50
|
||||||
|
"""
|
||||||
|
print("Loading weights: ", path)
|
||||||
|
|
||||||
|
super(MidasNet_small, self).__init__()
|
||||||
|
|
||||||
|
use_pretrained = False if path else True
|
||||||
|
|
||||||
|
self.channels_last = channels_last
|
||||||
|
self.blocks = blocks
|
||||||
|
self.backbone = backbone
|
||||||
|
|
||||||
|
self.groups = 1
|
||||||
|
|
||||||
|
features1=features
|
||||||
|
features2=features
|
||||||
|
features3=features
|
||||||
|
features4=features
|
||||||
|
self.expand = False
|
||||||
|
if "expand" in self.blocks and self.blocks['expand'] == True:
|
||||||
|
self.expand = True
|
||||||
|
features1=features
|
||||||
|
features2=features*2
|
||||||
|
features3=features*4
|
||||||
|
features4=features*8
|
||||||
|
|
||||||
|
self.pretrained, self.scratch = _make_encoder(self.backbone, features, use_pretrained, groups=self.groups, expand=self.expand, exportable=exportable)
|
||||||
|
|
||||||
|
self.scratch.activation = nn.ReLU(False)
|
||||||
|
|
||||||
|
self.scratch.refinenet4 = FeatureFusionBlock_custom(features4, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
|
||||||
|
self.scratch.refinenet3 = FeatureFusionBlock_custom(features3, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
|
||||||
|
self.scratch.refinenet2 = FeatureFusionBlock_custom(features2, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
|
||||||
|
self.scratch.refinenet1 = FeatureFusionBlock_custom(features1, self.scratch.activation, deconv=False, bn=False, align_corners=align_corners)
|
||||||
|
|
||||||
|
|
||||||
|
self.scratch.output_conv = nn.Sequential(
|
||||||
|
nn.Conv2d(features, features//2, kernel_size=3, stride=1, padding=1, groups=self.groups),
|
||||||
|
Interpolate(scale_factor=2, mode="bilinear"),
|
||||||
|
nn.Conv2d(features//2, 32, kernel_size=3, stride=1, padding=1),
|
||||||
|
self.scratch.activation,
|
||||||
|
nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
|
||||||
|
nn.ReLU(True) if non_negative else nn.Identity(),
|
||||||
|
nn.Identity(),
|
||||||
|
)
|
||||||
|
|
||||||
|
if path:
|
||||||
|
self.load(path)
|
||||||
|
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
"""Forward pass.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
x (tensor): input data (image)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
tensor: depth
|
||||||
|
"""
|
||||||
|
if self.channels_last==True:
|
||||||
|
print("self.channels_last = ", self.channels_last)
|
||||||
|
x.contiguous(memory_format=torch.channels_last)
|
||||||
|
|
||||||
|
|
||||||
|
layer_1 = self.pretrained.layer1(x)
|
||||||
|
layer_2 = self.pretrained.layer2(layer_1)
|
||||||
|
layer_3 = self.pretrained.layer3(layer_2)
|
||||||
|
layer_4 = self.pretrained.layer4(layer_3)
|
||||||
|
|
||||||
|
layer_1_rn = self.scratch.layer1_rn(layer_1)
|
||||||
|
layer_2_rn = self.scratch.layer2_rn(layer_2)
|
||||||
|
layer_3_rn = self.scratch.layer3_rn(layer_3)
|
||||||
|
layer_4_rn = self.scratch.layer4_rn(layer_4)
|
||||||
|
|
||||||
|
|
||||||
|
path_4 = self.scratch.refinenet4(layer_4_rn)
|
||||||
|
path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
|
||||||
|
path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
|
||||||
|
path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
|
||||||
|
|
||||||
|
out = self.scratch.output_conv(path_1)
|
||||||
|
|
||||||
|
return torch.squeeze(out, dim=1)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def fuse_model(m):
|
||||||
|
prev_previous_type = nn.Identity()
|
||||||
|
prev_previous_name = ''
|
||||||
|
previous_type = nn.Identity()
|
||||||
|
previous_name = ''
|
||||||
|
for name, module in m.named_modules():
|
||||||
|
if prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d and type(module) == nn.ReLU:
|
||||||
|
# print("FUSED ", prev_previous_name, previous_name, name)
|
||||||
|
torch.quantization.fuse_modules(m, [prev_previous_name, previous_name, name], inplace=True)
|
||||||
|
elif prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d:
|
||||||
|
# print("FUSED ", prev_previous_name, previous_name)
|
||||||
|
torch.quantization.fuse_modules(m, [prev_previous_name, previous_name], inplace=True)
|
||||||
|
# elif previous_type == nn.Conv2d and type(module) == nn.ReLU:
|
||||||
|
# print("FUSED ", previous_name, name)
|
||||||
|
# torch.quantization.fuse_modules(m, [previous_name, name], inplace=True)
|
||||||
|
|
||||||
|
prev_previous_type = previous_type
|
||||||
|
prev_previous_name = previous_name
|
||||||
|
previous_type = type(module)
|
||||||
|
previous_name = name
|
234
AnyText/AnyText_scripts/ldm/modules/midas/midas/transforms.py
Normal file
@ -0,0 +1,234 @@
|
|||||||
|
import numpy as np
|
||||||
|
import cv2
|
||||||
|
import math
|
||||||
|
|
||||||
|
|
||||||
|
def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA):
|
||||||
|
"""Rezise the sample to ensure the given size. Keeps aspect ratio.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
sample (dict): sample
|
||||||
|
size (tuple): image size
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
tuple: new size
|
||||||
|
"""
|
||||||
|
shape = list(sample["disparity"].shape)
|
||||||
|
|
||||||
|
if shape[0] >= size[0] and shape[1] >= size[1]:
|
||||||
|
return sample
|
||||||
|
|
||||||
|
scale = [0, 0]
|
||||||
|
scale[0] = size[0] / shape[0]
|
||||||
|
scale[1] = size[1] / shape[1]
|
||||||
|
|
||||||
|
scale = max(scale)
|
||||||
|
|
||||||
|
shape[0] = math.ceil(scale * shape[0])
|
||||||
|
shape[1] = math.ceil(scale * shape[1])
|
||||||
|
|
||||||
|
# resize
|
||||||
|
sample["image"] = cv2.resize(
|
||||||
|
sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method
|
||||||
|
)
|
||||||
|
|
||||||
|
sample["disparity"] = cv2.resize(
|
||||||
|
sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST
|
||||||
|
)
|
||||||
|
sample["mask"] = cv2.resize(
|
||||||
|
sample["mask"].astype(np.float32),
|
||||||
|
tuple(shape[::-1]),
|
||||||
|
interpolation=cv2.INTER_NEAREST,
|
||||||
|
)
|
||||||
|
sample["mask"] = sample["mask"].astype(bool)
|
||||||
|
|
||||||
|
return tuple(shape)
|
||||||
|
|
||||||
|
|
||||||
|
class Resize(object):
|
||||||
|
"""Resize sample to given size (width, height).
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
width,
|
||||||
|
height,
|
||||||
|
resize_target=True,
|
||||||
|
keep_aspect_ratio=False,
|
||||||
|
ensure_multiple_of=1,
|
||||||
|
resize_method="lower_bound",
|
||||||
|
image_interpolation_method=cv2.INTER_AREA,
|
||||||
|
):
|
||||||
|
"""Init.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
width (int): desired output width
|
||||||
|
height (int): desired output height
|
||||||
|
resize_target (bool, optional):
|
||||||
|
True: Resize the full sample (image, mask, target).
|
||||||
|
False: Resize image only.
|
||||||
|
Defaults to True.
|
||||||
|
keep_aspect_ratio (bool, optional):
|
||||||
|
True: Keep the aspect ratio of the input sample.
|
||||||
|
Output sample might not have the given width and height, and
|
||||||
|
resize behaviour depends on the parameter 'resize_method'.
|
||||||
|
Defaults to False.
|
||||||
|
ensure_multiple_of (int, optional):
|
||||||
|
Output width and height is constrained to be multiple of this parameter.
|
||||||
|
Defaults to 1.
|
||||||
|
resize_method (str, optional):
|
||||||
|
"lower_bound": Output will be at least as large as the given size.
|
||||||
|
"upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
|
||||||
|
"minimal": Scale as least as possible. (Output size might be smaller than given size.)
|
||||||
|
Defaults to "lower_bound".
|
||||||
|
"""
|
||||||
|
self.__width = width
|
||||||
|
self.__height = height
|
||||||
|
|
||||||
|
self.__resize_target = resize_target
|
||||||
|
self.__keep_aspect_ratio = keep_aspect_ratio
|
||||||
|
self.__multiple_of = ensure_multiple_of
|
||||||
|
self.__resize_method = resize_method
|
||||||
|
self.__image_interpolation_method = image_interpolation_method
|
||||||
|
|
||||||
|
def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
|
||||||
|
y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
|
||||||
|
|
||||||
|
if max_val is not None and y > max_val:
|
||||||
|
y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)
|
||||||
|
|
||||||
|
if y < min_val:
|
||||||
|
y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)
|
||||||
|
|
||||||
|
return y
|
||||||
|
|
||||||
|
def get_size(self, width, height):
|
||||||
|
# determine new height and width
|
||||||
|
scale_height = self.__height / height
|
||||||
|
scale_width = self.__width / width
|
||||||
|
|
||||||
|
if self.__keep_aspect_ratio:
|
||||||
|
if self.__resize_method == "lower_bound":
|
||||||
|
# scale such that output size is lower bound
|
||||||
|
if scale_width > scale_height:
|
||||||
|
# fit width
|
||||||
|
scale_height = scale_width
|
||||||
|
else:
|
||||||
|
# fit height
|
||||||
|
scale_width = scale_height
|
||||||
|
elif self.__resize_method == "upper_bound":
|
||||||
|
# scale such that output size is upper bound
|
||||||
|
if scale_width < scale_height:
|
||||||
|
# fit width
|
||||||
|
scale_height = scale_width
|
||||||
|
else:
|
||||||
|
# fit height
|
||||||
|
scale_width = scale_height
|
||||||
|
elif self.__resize_method == "minimal":
|
||||||
|
# scale as least as possbile
|
||||||
|
if abs(1 - scale_width) < abs(1 - scale_height):
|
||||||
|
# fit width
|
||||||
|
scale_height = scale_width
|
||||||
|
else:
|
||||||
|
# fit height
|
||||||
|
scale_width = scale_height
|
||||||
|
else:
|
||||||
|
raise ValueError(
|
||||||
|
f"resize_method {self.__resize_method} not implemented"
|
||||||
|
)
|
||||||
|
|
||||||
|
if self.__resize_method == "lower_bound":
|
||||||
|
new_height = self.constrain_to_multiple_of(
|
||||||
|
scale_height * height, min_val=self.__height
|
||||||
|
)
|
||||||
|
new_width = self.constrain_to_multiple_of(
|
||||||
|
scale_width * width, min_val=self.__width
|
||||||
|
)
|
||||||
|
elif self.__resize_method == "upper_bound":
|
||||||
|
new_height = self.constrain_to_multiple_of(
|
||||||
|
scale_height * height, max_val=self.__height
|
||||||
|
)
|
||||||
|
new_width = self.constrain_to_multiple_of(
|
||||||
|
scale_width * width, max_val=self.__width
|
||||||
|
)
|
||||||
|
elif self.__resize_method == "minimal":
|
||||||
|
new_height = self.constrain_to_multiple_of(scale_height * height)
|
||||||
|
new_width = self.constrain_to_multiple_of(scale_width * width)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"resize_method {self.__resize_method} not implemented")
|
||||||
|
|
||||||
|
return (new_width, new_height)
|
||||||
|
|
||||||
|
def __call__(self, sample):
|
||||||
|
width, height = self.get_size(
|
||||||
|
sample["image"].shape[1], sample["image"].shape[0]
|
||||||
|
)
|
||||||
|
|
||||||
|
# resize sample
|
||||||
|
sample["image"] = cv2.resize(
|
||||||
|
sample["image"],
|
||||||
|
(width, height),
|
||||||
|
interpolation=self.__image_interpolation_method,
|
||||||
|
)
|
||||||
|
|
||||||
|
if self.__resize_target:
|
||||||
|
if "disparity" in sample:
|
||||||
|
sample["disparity"] = cv2.resize(
|
||||||
|
sample["disparity"],
|
||||||
|
(width, height),
|
||||||
|
interpolation=cv2.INTER_NEAREST,
|
||||||
|
)
|
||||||
|
|
||||||
|
if "depth" in sample:
|
||||||
|
sample["depth"] = cv2.resize(
|
||||||
|
sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST
|
||||||
|
)
|
||||||
|
|
||||||
|
sample["mask"] = cv2.resize(
|
||||||
|
sample["mask"].astype(np.float32),
|
||||||
|
(width, height),
|
||||||
|
interpolation=cv2.INTER_NEAREST,
|
||||||
|
)
|
||||||
|
sample["mask"] = sample["mask"].astype(bool)
|
||||||
|
|
||||||
|
return sample
|
||||||
|
|
||||||
|
|
||||||
|
class NormalizeImage(object):
|
||||||
|
"""Normlize image by given mean and std.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, mean, std):
|
||||||
|
self.__mean = mean
|
||||||
|
self.__std = std
|
||||||
|
|
||||||
|
def __call__(self, sample):
|
||||||
|
sample["image"] = (sample["image"] - self.__mean) / self.__std
|
||||||
|
|
||||||
|
return sample
|
||||||
|
|
||||||
|
|
||||||
|
class PrepareForNet(object):
|
||||||
|
"""Prepare sample for usage as network input.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def __call__(self, sample):
|
||||||
|
image = np.transpose(sample["image"], (2, 0, 1))
|
||||||
|
sample["image"] = np.ascontiguousarray(image).astype(np.float32)
|
||||||
|
|
||||||
|
if "mask" in sample:
|
||||||
|
sample["mask"] = sample["mask"].astype(np.float32)
|
||||||
|
sample["mask"] = np.ascontiguousarray(sample["mask"])
|
||||||
|
|
||||||
|
if "disparity" in sample:
|
||||||
|
disparity = sample["disparity"].astype(np.float32)
|
||||||
|
sample["disparity"] = np.ascontiguousarray(disparity)
|
||||||
|
|
||||||
|
if "depth" in sample:
|
||||||
|
depth = sample["depth"].astype(np.float32)
|
||||||
|
sample["depth"] = np.ascontiguousarray(depth)
|
||||||
|
|
||||||
|
return sample
|
491
AnyText/AnyText_scripts/ldm/modules/midas/midas/vit.py
Normal file
@ -0,0 +1,491 @@
|
|||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import timm
|
||||||
|
import types
|
||||||
|
import math
|
||||||
|
import torch.nn.functional as F
|
||||||
|
|
||||||
|
|
||||||
|
class Slice(nn.Module):
|
||||||
|
def __init__(self, start_index=1):
|
||||||
|
super(Slice, self).__init__()
|
||||||
|
self.start_index = start_index
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
return x[:, self.start_index :]
|
||||||
|
|
||||||
|
|
||||||
|
class AddReadout(nn.Module):
|
||||||
|
def __init__(self, start_index=1):
|
||||||
|
super(AddReadout, self).__init__()
|
||||||
|
self.start_index = start_index
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
if self.start_index == 2:
|
||||||
|
readout = (x[:, 0] + x[:, 1]) / 2
|
||||||
|
else:
|
||||||
|
readout = x[:, 0]
|
||||||
|
return x[:, self.start_index :] + readout.unsqueeze(1)
|
||||||
|
|
||||||
|
|
||||||
|
class ProjectReadout(nn.Module):
|
||||||
|
def __init__(self, in_features, start_index=1):
|
||||||
|
super(ProjectReadout, self).__init__()
|
||||||
|
self.start_index = start_index
|
||||||
|
|
||||||
|
self.project = nn.Sequential(nn.Linear(2 * in_features, in_features), nn.GELU())
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index :])
|
||||||
|
features = torch.cat((x[:, self.start_index :], readout), -1)
|
||||||
|
|
||||||
|
return self.project(features)
|
||||||
|
|
||||||
|
|
||||||
|
class Transpose(nn.Module):
|
||||||
|
def __init__(self, dim0, dim1):
|
||||||
|
super(Transpose, self).__init__()
|
||||||
|
self.dim0 = dim0
|
||||||
|
self.dim1 = dim1
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
x = x.transpose(self.dim0, self.dim1)
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
def forward_vit(pretrained, x):
|
||||||
|
b, c, h, w = x.shape
|
||||||
|
|
||||||
|
glob = pretrained.model.forward_flex(x)
|
||||||
|
|
||||||
|
layer_1 = pretrained.activations["1"]
|
||||||
|
layer_2 = pretrained.activations["2"]
|
||||||
|
layer_3 = pretrained.activations["3"]
|
||||||
|
layer_4 = pretrained.activations["4"]
|
||||||
|
|
||||||
|
layer_1 = pretrained.act_postprocess1[0:2](layer_1)
|
||||||
|
layer_2 = pretrained.act_postprocess2[0:2](layer_2)
|
||||||
|
layer_3 = pretrained.act_postprocess3[0:2](layer_3)
|
||||||
|
layer_4 = pretrained.act_postprocess4[0:2](layer_4)
|
||||||
|
|
||||||
|
unflatten = nn.Sequential(
|
||||||
|
nn.Unflatten(
|
||||||
|
2,
|
||||||
|
torch.Size(
|
||||||
|
[
|
||||||
|
h // pretrained.model.patch_size[1],
|
||||||
|
w // pretrained.model.patch_size[0],
|
||||||
|
]
|
||||||
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
if layer_1.ndim == 3:
|
||||||
|
layer_1 = unflatten(layer_1)
|
||||||
|
if layer_2.ndim == 3:
|
||||||
|
layer_2 = unflatten(layer_2)
|
||||||
|
if layer_3.ndim == 3:
|
||||||
|
layer_3 = unflatten(layer_3)
|
||||||
|
if layer_4.ndim == 3:
|
||||||
|
layer_4 = unflatten(layer_4)
|
||||||
|
|
||||||
|
layer_1 = pretrained.act_postprocess1[3 : len(pretrained.act_postprocess1)](layer_1)
|
||||||
|
layer_2 = pretrained.act_postprocess2[3 : len(pretrained.act_postprocess2)](layer_2)
|
||||||
|
layer_3 = pretrained.act_postprocess3[3 : len(pretrained.act_postprocess3)](layer_3)
|
||||||
|
layer_4 = pretrained.act_postprocess4[3 : len(pretrained.act_postprocess4)](layer_4)
|
||||||
|
|
||||||
|
return layer_1, layer_2, layer_3, layer_4
|
||||||
|
|
||||||
|
|
||||||
|
def _resize_pos_embed(self, posemb, gs_h, gs_w):
|
||||||
|
posemb_tok, posemb_grid = (
|
||||||
|
posemb[:, : self.start_index],
|
||||||
|
posemb[0, self.start_index :],
|
||||||
|
)
|
||||||
|
|
||||||
|
gs_old = int(math.sqrt(len(posemb_grid)))
|
||||||
|
|
||||||
|
posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
|
||||||
|
posemb_grid = F.interpolate(posemb_grid, size=(gs_h, gs_w), mode="bilinear")
|
||||||
|
posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1)
|
||||||
|
|
||||||
|
posemb = torch.cat([posemb_tok, posemb_grid], dim=1)
|
||||||
|
|
||||||
|
return posemb
|
||||||
|
|
||||||
|
|
||||||
|
def forward_flex(self, x):
|
||||||
|
b, c, h, w = x.shape
|
||||||
|
|
||||||
|
pos_embed = self._resize_pos_embed(
|
||||||
|
self.pos_embed, h // self.patch_size[1], w // self.patch_size[0]
|
||||||
|
)
|
||||||
|
|
||||||
|
B = x.shape[0]
|
||||||
|
|
||||||
|
if hasattr(self.patch_embed, "backbone"):
|
||||||
|
x = self.patch_embed.backbone(x)
|
||||||
|
if isinstance(x, (list, tuple)):
|
||||||
|
x = x[-1] # last feature if backbone outputs list/tuple of features
|
||||||
|
|
||||||
|
x = self.patch_embed.proj(x).flatten(2).transpose(1, 2)
|
||||||
|
|
||||||
|
if getattr(self, "dist_token", None) is not None:
|
||||||
|
cls_tokens = self.cls_token.expand(
|
||||||
|
B, -1, -1
|
||||||
|
) # stole cls_tokens impl from Phil Wang, thanks
|
||||||
|
dist_token = self.dist_token.expand(B, -1, -1)
|
||||||
|
x = torch.cat((cls_tokens, dist_token, x), dim=1)
|
||||||
|
else:
|
||||||
|
cls_tokens = self.cls_token.expand(
|
||||||
|
B, -1, -1
|
||||||
|
) # stole cls_tokens impl from Phil Wang, thanks
|
||||||
|
x = torch.cat((cls_tokens, x), dim=1)
|
||||||
|
|
||||||
|
x = x + pos_embed
|
||||||
|
x = self.pos_drop(x)
|
||||||
|
|
||||||
|
for blk in self.blocks:
|
||||||
|
x = blk(x)
|
||||||
|
|
||||||
|
x = self.norm(x)
|
||||||
|
|
||||||
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
activations = {}
|
||||||
|
|
||||||
|
|
||||||
|
def get_activation(name):
|
||||||
|
def hook(model, input, output):
|
||||||
|
activations[name] = output
|
||||||
|
|
||||||
|
return hook
|
||||||
|
|
||||||
|
|
||||||
|
def get_readout_oper(vit_features, features, use_readout, start_index=1):
|
||||||
|
if use_readout == "ignore":
|
||||||
|
readout_oper = [Slice(start_index)] * len(features)
|
||||||
|
elif use_readout == "add":
|
||||||
|
readout_oper = [AddReadout(start_index)] * len(features)
|
||||||
|
elif use_readout == "project":
|
||||||
|
readout_oper = [
|
||||||
|
ProjectReadout(vit_features, start_index) for out_feat in features
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
assert (
|
||||||
|
False
|
||||||
|
), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'"
|
||||||
|
|
||||||
|
return readout_oper
|
||||||
|
|
||||||
|
|
||||||
|
def _make_vit_b16_backbone(
|
||||||
|
model,
|
||||||
|
features=[96, 192, 384, 768],
|
||||||
|
size=[384, 384],
|
||||||
|
hooks=[2, 5, 8, 11],
|
||||||
|
vit_features=768,
|
||||||
|
use_readout="ignore",
|
||||||
|
start_index=1,
|
||||||
|
):
|
||||||
|
pretrained = nn.Module()
|
||||||
|
|
||||||
|
pretrained.model = model
|
||||||
|
pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
|
||||||
|
pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
|
||||||
|
pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
|
||||||
|
pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4"))
|
||||||
|
|
||||||
|
pretrained.activations = activations
|
||||||
|
|
||||||
|
readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)
|
||||||
|
|
||||||
|
# 32, 48, 136, 384
|
||||||
|
pretrained.act_postprocess1 = nn.Sequential(
|
||||||
|
readout_oper[0],
|
||||||
|
Transpose(1, 2),
|
||||||
|
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
|
||||||
|
nn.Conv2d(
|
||||||
|
in_channels=vit_features,
|
||||||
|
out_channels=features[0],
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0,
|
||||||
|
),
|
||||||
|
nn.ConvTranspose2d(
|
||||||
|
in_channels=features[0],
|
||||||
|
out_channels=features[0],
|
||||||
|
kernel_size=4,
|
||||||
|
stride=4,
|
||||||
|
padding=0,
|
||||||
|
bias=True,
|
||||||
|
dilation=1,
|
||||||
|
groups=1,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
pretrained.act_postprocess2 = nn.Sequential(
|
||||||
|
readout_oper[1],
|
||||||
|
Transpose(1, 2),
|
||||||
|
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
|
||||||
|
nn.Conv2d(
|
||||||
|
in_channels=vit_features,
|
||||||
|
out_channels=features[1],
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0,
|
||||||
|
),
|
||||||
|
nn.ConvTranspose2d(
|
||||||
|
in_channels=features[1],
|
||||||
|
out_channels=features[1],
|
||||||
|
kernel_size=2,
|
||||||
|
stride=2,
|
||||||
|
padding=0,
|
||||||
|
bias=True,
|
||||||
|
dilation=1,
|
||||||
|
groups=1,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
pretrained.act_postprocess3 = nn.Sequential(
|
||||||
|
readout_oper[2],
|
||||||
|
Transpose(1, 2),
|
||||||
|
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
|
||||||
|
nn.Conv2d(
|
||||||
|
in_channels=vit_features,
|
||||||
|
out_channels=features[2],
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
pretrained.act_postprocess4 = nn.Sequential(
|
||||||
|
readout_oper[3],
|
||||||
|
Transpose(1, 2),
|
||||||
|
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
|
||||||
|
nn.Conv2d(
|
||||||
|
in_channels=vit_features,
|
||||||
|
out_channels=features[3],
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0,
|
||||||
|
),
|
||||||
|
nn.Conv2d(
|
||||||
|
in_channels=features[3],
|
||||||
|
out_channels=features[3],
|
||||||
|
kernel_size=3,
|
||||||
|
stride=2,
|
||||||
|
padding=1,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
pretrained.model.start_index = start_index
|
||||||
|
pretrained.model.patch_size = [16, 16]
|
||||||
|
|
||||||
|
# We inject this function into the VisionTransformer instances so that
|
||||||
|
# we can use it with interpolated position embeddings without modifying the library source.
|
||||||
|
pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
|
||||||
|
pretrained.model._resize_pos_embed = types.MethodType(
|
||||||
|
_resize_pos_embed, pretrained.model
|
||||||
|
)
|
||||||
|
|
||||||
|
return pretrained
|
||||||
|
|
||||||
|
|
||||||
|
def _make_pretrained_vitl16_384(pretrained, use_readout="ignore", hooks=None):
|
||||||
|
model = timm.create_model("vit_large_patch16_384", pretrained=pretrained)
|
||||||
|
|
||||||
|
hooks = [5, 11, 17, 23] if hooks == None else hooks
|
||||||
|
return _make_vit_b16_backbone(
|
||||||
|
model,
|
||||||
|
features=[256, 512, 1024, 1024],
|
||||||
|
hooks=hooks,
|
||||||
|
vit_features=1024,
|
||||||
|
use_readout=use_readout,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_pretrained_vitb16_384(pretrained, use_readout="ignore", hooks=None):
|
||||||
|
model = timm.create_model("vit_base_patch16_384", pretrained=pretrained)
|
||||||
|
|
||||||
|
hooks = [2, 5, 8, 11] if hooks == None else hooks
|
||||||
|
return _make_vit_b16_backbone(
|
||||||
|
model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_pretrained_deitb16_384(pretrained, use_readout="ignore", hooks=None):
|
||||||
|
model = timm.create_model("vit_deit_base_patch16_384", pretrained=pretrained)
|
||||||
|
|
||||||
|
hooks = [2, 5, 8, 11] if hooks == None else hooks
|
||||||
|
return _make_vit_b16_backbone(
|
||||||
|
model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_pretrained_deitb16_distil_384(pretrained, use_readout="ignore", hooks=None):
|
||||||
|
model = timm.create_model(
|
||||||
|
"vit_deit_base_distilled_patch16_384", pretrained=pretrained
|
||||||
|
)
|
||||||
|
|
||||||
|
hooks = [2, 5, 8, 11] if hooks == None else hooks
|
||||||
|
return _make_vit_b16_backbone(
|
||||||
|
model,
|
||||||
|
features=[96, 192, 384, 768],
|
||||||
|
hooks=hooks,
|
||||||
|
use_readout=use_readout,
|
||||||
|
start_index=2,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_vit_b_rn50_backbone(
|
||||||
|
model,
|
||||||
|
features=[256, 512, 768, 768],
|
||||||
|
size=[384, 384],
|
||||||
|
hooks=[0, 1, 8, 11],
|
||||||
|
vit_features=768,
|
||||||
|
use_vit_only=False,
|
||||||
|
use_readout="ignore",
|
||||||
|
start_index=1,
|
||||||
|
):
|
||||||
|
pretrained = nn.Module()
|
||||||
|
|
||||||
|
pretrained.model = model
|
||||||
|
|
||||||
|
if use_vit_only == True:
|
||||||
|
pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
|
||||||
|
pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
|
||||||
|
else:
|
||||||
|
pretrained.model.patch_embed.backbone.stages[0].register_forward_hook(
|
||||||
|
get_activation("1")
|
||||||
|
)
|
||||||
|
pretrained.model.patch_embed.backbone.stages[1].register_forward_hook(
|
||||||
|
get_activation("2")
|
||||||
|
)
|
||||||
|
|
||||||
|
pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
|
||||||
|
pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4"))
|
||||||
|
|
||||||
|
pretrained.activations = activations
|
||||||
|
|
||||||
|
readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)
|
||||||
|
|
||||||
|
if use_vit_only == True:
|
||||||
|
pretrained.act_postprocess1 = nn.Sequential(
|
||||||
|
readout_oper[0],
|
||||||
|
Transpose(1, 2),
|
||||||
|
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
|
||||||
|
nn.Conv2d(
|
||||||
|
in_channels=vit_features,
|
||||||
|
out_channels=features[0],
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0,
|
||||||
|
),
|
||||||
|
nn.ConvTranspose2d(
|
||||||
|
in_channels=features[0],
|
||||||
|
out_channels=features[0],
|
||||||
|
kernel_size=4,
|
||||||
|
stride=4,
|
||||||
|
padding=0,
|
||||||
|
bias=True,
|
||||||
|
dilation=1,
|
||||||
|
groups=1,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
pretrained.act_postprocess2 = nn.Sequential(
|
||||||
|
readout_oper[1],
|
||||||
|
Transpose(1, 2),
|
||||||
|
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
|
||||||
|
nn.Conv2d(
|
||||||
|
in_channels=vit_features,
|
||||||
|
out_channels=features[1],
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0,
|
||||||
|
),
|
||||||
|
nn.ConvTranspose2d(
|
||||||
|
in_channels=features[1],
|
||||||
|
out_channels=features[1],
|
||||||
|
kernel_size=2,
|
||||||
|
stride=2,
|
||||||
|
padding=0,
|
||||||
|
bias=True,
|
||||||
|
dilation=1,
|
||||||
|
groups=1,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
pretrained.act_postprocess1 = nn.Sequential(
|
||||||
|
nn.Identity(), nn.Identity(), nn.Identity()
|
||||||
|
)
|
||||||
|
pretrained.act_postprocess2 = nn.Sequential(
|
||||||
|
nn.Identity(), nn.Identity(), nn.Identity()
|
||||||
|
)
|
||||||
|
|
||||||
|
pretrained.act_postprocess3 = nn.Sequential(
|
||||||
|
readout_oper[2],
|
||||||
|
Transpose(1, 2),
|
||||||
|
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
|
||||||
|
nn.Conv2d(
|
||||||
|
in_channels=vit_features,
|
||||||
|
out_channels=features[2],
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
pretrained.act_postprocess4 = nn.Sequential(
|
||||||
|
readout_oper[3],
|
||||||
|
Transpose(1, 2),
|
||||||
|
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
|
||||||
|
nn.Conv2d(
|
||||||
|
in_channels=vit_features,
|
||||||
|
out_channels=features[3],
|
||||||
|
kernel_size=1,
|
||||||
|
stride=1,
|
||||||
|
padding=0,
|
||||||
|
),
|
||||||
|
nn.Conv2d(
|
||||||
|
in_channels=features[3],
|
||||||
|
out_channels=features[3],
|
||||||
|
kernel_size=3,
|
||||||
|
stride=2,
|
||||||
|
padding=1,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
pretrained.model.start_index = start_index
|
||||||
|
pretrained.model.patch_size = [16, 16]
|
||||||
|
|
||||||
|
# We inject this function into the VisionTransformer instances so that
|
||||||
|
# we can use it with interpolated position embeddings without modifying the library source.
|
||||||
|
pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
|
||||||
|
|
||||||
|
# We inject this function into the VisionTransformer instances so that
|
||||||
|
# we can use it with interpolated position embeddings without modifying the library source.
|
||||||
|
pretrained.model._resize_pos_embed = types.MethodType(
|
||||||
|
_resize_pos_embed, pretrained.model
|
||||||
|
)
|
||||||
|
|
||||||
|
return pretrained
|
||||||
|
|
||||||
|
|
||||||
|
def _make_pretrained_vitb_rn50_384(
|
||||||
|
pretrained, use_readout="ignore", hooks=None, use_vit_only=False
|
||||||
|
):
|
||||||
|
model = timm.create_model("vit_base_resnet50_384", pretrained=pretrained)
|
||||||
|
|
||||||
|
hooks = [0, 1, 8, 11] if hooks == None else hooks
|
||||||
|
return _make_vit_b_rn50_backbone(
|
||||||
|
model,
|
||||||
|
features=[256, 512, 768, 768],
|
||||||
|
size=[384, 384],
|
||||||
|
hooks=hooks,
|
||||||
|
use_vit_only=use_vit_only,
|
||||||
|
use_readout=use_readout,
|
||||||
|
)
|
189
AnyText/AnyText_scripts/ldm/modules/midas/utils.py
Normal file
@ -0,0 +1,189 @@
|
|||||||
|
"""Utils for monoDepth."""
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
import numpy as np
|
||||||
|
import cv2
|
||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
def read_pfm(path):
|
||||||
|
"""Read pfm file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path (str): path to file
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
tuple: (data, scale)
|
||||||
|
"""
|
||||||
|
with open(path, "rb") as file:
|
||||||
|
|
||||||
|
color = None
|
||||||
|
width = None
|
||||||
|
height = None
|
||||||
|
scale = None
|
||||||
|
endian = None
|
||||||
|
|
||||||
|
header = file.readline().rstrip()
|
||||||
|
if header.decode("ascii") == "PF":
|
||||||
|
color = True
|
||||||
|
elif header.decode("ascii") == "Pf":
|
||||||
|
color = False
|
||||||
|
else:
|
||||||
|
raise Exception("Not a PFM file: " + path)
|
||||||
|
|
||||||
|
dim_match = re.match(r"^(\d+)\s(\d+)\s$", file.readline().decode("ascii"))
|
||||||
|
if dim_match:
|
||||||
|
width, height = list(map(int, dim_match.groups()))
|
||||||
|
else:
|
||||||
|
raise Exception("Malformed PFM header.")
|
||||||
|
|
||||||
|
scale = float(file.readline().decode("ascii").rstrip())
|
||||||
|
if scale < 0:
|
||||||
|
# little-endian
|
||||||
|
endian = "<"
|
||||||
|
scale = -scale
|
||||||
|
else:
|
||||||
|
# big-endian
|
||||||
|
endian = ">"
|
||||||
|
|
||||||
|
data = np.fromfile(file, endian + "f")
|
||||||
|
shape = (height, width, 3) if color else (height, width)
|
||||||
|
|
||||||
|
data = np.reshape(data, shape)
|
||||||
|
data = np.flipud(data)
|
||||||
|
|
||||||
|
return data, scale
|
||||||
|
|
||||||
|
|
||||||
|
def write_pfm(path, image, scale=1):
|
||||||
|
"""Write pfm file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path (str): pathto file
|
||||||
|
image (array): data
|
||||||
|
scale (int, optional): Scale. Defaults to 1.
|
||||||
|
"""
|
||||||
|
|
||||||
|
with open(path, "wb") as file:
|
||||||
|
color = None
|
||||||
|
|
||||||
|
if image.dtype.name != "float32":
|
||||||
|
raise Exception("Image dtype must be float32.")
|
||||||
|
|
||||||
|
image = np.flipud(image)
|
||||||
|
|
||||||
|
if len(image.shape) == 3 and image.shape[2] == 3: # color image
|
||||||
|
color = True
|
||||||
|
elif (
|
||||||
|
len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1
|
||||||
|
): # greyscale
|
||||||
|
color = False
|
||||||
|
else:
|
||||||
|
raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.")
|
||||||
|
|
||||||
|
file.write("PF\n" if color else "Pf\n".encode())
|
||||||
|
file.write("%d %d\n".encode() % (image.shape[1], image.shape[0]))
|
||||||
|
|
||||||
|
endian = image.dtype.byteorder
|
||||||
|
|
||||||
|
if endian == "<" or endian == "=" and sys.byteorder == "little":
|
||||||
|
scale = -scale
|
||||||
|
|
||||||
|
file.write("%f\n".encode() % scale)
|
||||||
|
|
||||||
|
image.tofile(file)
|
||||||
|
|
||||||
|
|
||||||
|
def read_image(path):
|
||||||
|
"""Read image and output RGB image (0-1).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path (str): path to file
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
array: RGB image (0-1)
|
||||||
|
"""
|
||||||
|
img = cv2.imread(path)
|
||||||
|
|
||||||
|
if img.ndim == 2:
|
||||||
|
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
|
||||||
|
|
||||||
|
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0
|
||||||
|
|
||||||
|
return img
|
||||||
|
|
||||||
|
|
||||||
|
def resize_image(img):
|
||||||
|
"""Resize image and make it fit for network.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
img (array): image
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
tensor: data ready for network
|
||||||
|
"""
|
||||||
|
height_orig = img.shape[0]
|
||||||
|
width_orig = img.shape[1]
|
||||||
|
|
||||||
|
if width_orig > height_orig:
|
||||||
|
scale = width_orig / 384
|
||||||
|
else:
|
||||||
|
scale = height_orig / 384
|
||||||
|
|
||||||
|
height = (np.ceil(height_orig / scale / 32) * 32).astype(int)
|
||||||
|
width = (np.ceil(width_orig / scale / 32) * 32).astype(int)
|
||||||
|
|
||||||
|
img_resized = cv2.resize(img, (width, height), interpolation=cv2.INTER_AREA)
|
||||||
|
|
||||||
|
img_resized = (
|
||||||
|
torch.from_numpy(np.transpose(img_resized, (2, 0, 1))).contiguous().float()
|
||||||
|
)
|
||||||
|
img_resized = img_resized.unsqueeze(0)
|
||||||
|
|
||||||
|
return img_resized
|
||||||
|
|
||||||
|
|
||||||
|
def resize_depth(depth, width, height):
|
||||||
|
"""Resize depth map and bring to CPU (numpy).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
depth (tensor): depth
|
||||||
|
width (int): image width
|
||||||
|
height (int): image height
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
array: processed depth
|
||||||
|
"""
|
||||||
|
depth = torch.squeeze(depth[0, :, :, :]).to("cpu")
|
||||||
|
|
||||||
|
depth_resized = cv2.resize(
|
||||||
|
depth.numpy(), (width, height), interpolation=cv2.INTER_CUBIC
|
||||||
|
)
|
||||||
|
|
||||||
|
return depth_resized
|
||||||
|
|
||||||
|
def write_depth(path, depth, bits=1):
|
||||||
|
"""Write depth map to pfm and png file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path (str): filepath without extension
|
||||||
|
depth (array): depth
|
||||||
|
"""
|
||||||
|
write_pfm(path + ".pfm", depth.astype(np.float32))
|
||||||
|
|
||||||
|
depth_min = depth.min()
|
||||||
|
depth_max = depth.max()
|
||||||
|
|
||||||
|
max_val = (2**(8*bits))-1
|
||||||
|
|
||||||
|
if depth_max - depth_min > np.finfo("float").eps:
|
||||||
|
out = max_val * (depth - depth_min) / (depth_max - depth_min)
|
||||||
|
else:
|
||||||
|
out = np.zeros(depth.shape, dtype=depth.type)
|
||||||
|
|
||||||
|
if bits == 1:
|
||||||
|
cv2.imwrite(path + ".png", out.astype("uint8"))
|
||||||
|
elif bits == 2:
|
||||||
|
cv2.imwrite(path + ".png", out.astype("uint16"))
|
||||||
|
|
||||||
|
return
|
197
AnyText/AnyText_scripts/ldm/util.py
Normal file
@ -0,0 +1,197 @@
|
|||||||
|
import importlib
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from torch import optim
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from inspect import isfunction
|
||||||
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
|
|
||||||
|
|
||||||
|
def log_txt_as_img(wh, xc, size=10):
|
||||||
|
# wh a tuple of (width, height)
|
||||||
|
# xc a list of captions to plot
|
||||||
|
b = len(xc)
|
||||||
|
txts = list()
|
||||||
|
for bi in range(b):
|
||||||
|
txt = Image.new("RGB", wh, color="white")
|
||||||
|
draw = ImageDraw.Draw(txt)
|
||||||
|
font = ImageFont.truetype('font/Arial_Unicode.ttf', size=size)
|
||||||
|
nc = int(32 * (wh[0] / 256))
|
||||||
|
lines = "\n".join(xc[bi][start:start + nc] for start in range(0, len(xc[bi]), nc))
|
||||||
|
|
||||||
|
try:
|
||||||
|
draw.text((0, 0), lines, fill="black", font=font)
|
||||||
|
except UnicodeEncodeError:
|
||||||
|
print("Cant encode string for logging. Skipping.")
|
||||||
|
|
||||||
|
txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0
|
||||||
|
txts.append(txt)
|
||||||
|
txts = np.stack(txts)
|
||||||
|
txts = torch.tensor(txts)
|
||||||
|
return txts
|
||||||
|
|
||||||
|
|
||||||
|
def ismap(x):
|
||||||
|
if not isinstance(x, torch.Tensor):
|
||||||
|
return False
|
||||||
|
return (len(x.shape) == 4) and (x.shape[1] > 3)
|
||||||
|
|
||||||
|
|
||||||
|
def isimage(x):
|
||||||
|
if not isinstance(x,torch.Tensor):
|
||||||
|
return False
|
||||||
|
return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1)
|
||||||
|
|
||||||
|
|
||||||
|
def exists(x):
|
||||||
|
return x is not None
|
||||||
|
|
||||||
|
|
||||||
|
def default(val, d):
|
||||||
|
if exists(val):
|
||||||
|
return val
|
||||||
|
return d() if isfunction(d) else d
|
||||||
|
|
||||||
|
|
||||||
|
def mean_flat(tensor):
|
||||||
|
"""
|
||||||
|
https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86
|
||||||
|
Take the mean over all non-batch dimensions.
|
||||||
|
"""
|
||||||
|
return tensor.mean(dim=list(range(1, len(tensor.shape))))
|
||||||
|
|
||||||
|
|
||||||
|
def count_params(model, verbose=False):
|
||||||
|
total_params = sum(p.numel() for p in model.parameters())
|
||||||
|
if verbose:
|
||||||
|
print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.")
|
||||||
|
return total_params
|
||||||
|
|
||||||
|
|
||||||
|
def instantiate_from_config(config, **kwargs):
|
||||||
|
if "target" not in config:
|
||||||
|
if config == '__is_first_stage__':
|
||||||
|
return None
|
||||||
|
elif config == "__is_unconditional__":
|
||||||
|
return None
|
||||||
|
raise KeyError("Expected key `target` to instantiate.")
|
||||||
|
return get_obj_from_str(config["target"])(**config.get("params", dict()), **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def get_obj_from_str(string, reload=False):
|
||||||
|
module, cls = string.rsplit(".", 1)
|
||||||
|
if reload:
|
||||||
|
module_imp = importlib.import_module(module)
|
||||||
|
importlib.reload(module_imp)
|
||||||
|
return getattr(importlib.import_module(module, package=None), cls)
|
||||||
|
|
||||||
|
|
||||||
|
class AdamWwithEMAandWings(optim.Optimizer):
|
||||||
|
# credit to https://gist.github.com/crowsonkb/65f7265353f403714fce3b2595e0b298
|
||||||
|
def __init__(self, params, lr=1.e-3, betas=(0.9, 0.999), eps=1.e-8, # TODO: check hyperparameters before using
|
||||||
|
weight_decay=1.e-2, amsgrad=False, ema_decay=0.9999, # ema decay to match previous code
|
||||||
|
ema_power=1., param_names=()):
|
||||||
|
"""AdamW that saves EMA versions of the parameters."""
|
||||||
|
if not 0.0 <= lr:
|
||||||
|
raise ValueError("Invalid learning rate: {}".format(lr))
|
||||||
|
if not 0.0 <= eps:
|
||||||
|
raise ValueError("Invalid epsilon value: {}".format(eps))
|
||||||
|
if not 0.0 <= betas[0] < 1.0:
|
||||||
|
raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
|
||||||
|
if not 0.0 <= betas[1] < 1.0:
|
||||||
|
raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
|
||||||
|
if not 0.0 <= weight_decay:
|
||||||
|
raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
|
||||||
|
if not 0.0 <= ema_decay <= 1.0:
|
||||||
|
raise ValueError("Invalid ema_decay value: {}".format(ema_decay))
|
||||||
|
defaults = dict(lr=lr, betas=betas, eps=eps,
|
||||||
|
weight_decay=weight_decay, amsgrad=amsgrad, ema_decay=ema_decay,
|
||||||
|
ema_power=ema_power, param_names=param_names)
|
||||||
|
super().__init__(params, defaults)
|
||||||
|
|
||||||
|
def __setstate__(self, state):
|
||||||
|
super().__setstate__(state)
|
||||||
|
for group in self.param_groups:
|
||||||
|
group.setdefault('amsgrad', False)
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def step(self, closure=None):
|
||||||
|
"""Performs a single optimization step.
|
||||||
|
Args:
|
||||||
|
closure (callable, optional): A closure that reevaluates the model
|
||||||
|
and returns the loss.
|
||||||
|
"""
|
||||||
|
loss = None
|
||||||
|
if closure is not None:
|
||||||
|
with torch.enable_grad():
|
||||||
|
loss = closure()
|
||||||
|
|
||||||
|
for group in self.param_groups:
|
||||||
|
params_with_grad = []
|
||||||
|
grads = []
|
||||||
|
exp_avgs = []
|
||||||
|
exp_avg_sqs = []
|
||||||
|
ema_params_with_grad = []
|
||||||
|
state_sums = []
|
||||||
|
max_exp_avg_sqs = []
|
||||||
|
state_steps = []
|
||||||
|
amsgrad = group['amsgrad']
|
||||||
|
beta1, beta2 = group['betas']
|
||||||
|
ema_decay = group['ema_decay']
|
||||||
|
ema_power = group['ema_power']
|
||||||
|
|
||||||
|
for p in group['params']:
|
||||||
|
if p.grad is None:
|
||||||
|
continue
|
||||||
|
params_with_grad.append(p)
|
||||||
|
if p.grad.is_sparse:
|
||||||
|
raise RuntimeError('AdamW does not support sparse gradients')
|
||||||
|
grads.append(p.grad)
|
||||||
|
|
||||||
|
state = self.state[p]
|
||||||
|
|
||||||
|
# State initialization
|
||||||
|
if len(state) == 0:
|
||||||
|
state['step'] = 0
|
||||||
|
# Exponential moving average of gradient values
|
||||||
|
state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
|
||||||
|
# Exponential moving average of squared gradient values
|
||||||
|
state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
|
||||||
|
if amsgrad:
|
||||||
|
# Maintains max of all exp. moving avg. of sq. grad. values
|
||||||
|
state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
|
||||||
|
# Exponential moving average of parameter values
|
||||||
|
state['param_exp_avg'] = p.detach().float().clone()
|
||||||
|
|
||||||
|
exp_avgs.append(state['exp_avg'])
|
||||||
|
exp_avg_sqs.append(state['exp_avg_sq'])
|
||||||
|
ema_params_with_grad.append(state['param_exp_avg'])
|
||||||
|
|
||||||
|
if amsgrad:
|
||||||
|
max_exp_avg_sqs.append(state['max_exp_avg_sq'])
|
||||||
|
|
||||||
|
# update the steps for each param group update
|
||||||
|
state['step'] += 1
|
||||||
|
# record the step after step update
|
||||||
|
state_steps.append(state['step'])
|
||||||
|
|
||||||
|
optim._functional.adamw(params_with_grad,
|
||||||
|
grads,
|
||||||
|
exp_avgs,
|
||||||
|
exp_avg_sqs,
|
||||||
|
max_exp_avg_sqs,
|
||||||
|
state_steps,
|
||||||
|
amsgrad=amsgrad,
|
||||||
|
beta1=beta1,
|
||||||
|
beta2=beta2,
|
||||||
|
lr=group['lr'],
|
||||||
|
weight_decay=group['weight_decay'],
|
||||||
|
eps=group['eps'],
|
||||||
|
maximize=False)
|
||||||
|
|
||||||
|
cur_ema_decay = min(ema_decay, 1 - state['step'] ** -ema_power)
|
||||||
|
for param, ema_param in zip(params_with_grad, ema_params_with_grad):
|
||||||
|
ema_param.mul_(cur_ema_decay).add_(param.float(), alpha=1 - cur_ema_decay)
|
||||||
|
|
||||||
|
return loss
|
BIN
AnyText/assets/AnyText-wf.png
Normal file
After Width: | Height: | Size: 905 KiB |
103
AnyText/assets/README-Zh-CN.md
Normal file
@ -0,0 +1,103 @@
|
|||||||
|
# AnyText非官方的简单粗糙实现 | [English README](../../README.md)
|
||||||
|
|
||||||
|
## 原Repo: [AnyText: Multilingual Visual Text Generation And Editing](https://github.com/tyxsspa/AnyText)
|
||||||
|
|
||||||
|
## 个人原因暂停维护。
|
||||||
|
|
||||||
|
## 警告:
|
||||||
|
- 非程序员,所以很多问题我都没办法解决。
|
||||||
|
- **如果不需要`damo/nlp_csanmt_translation_zh2en`翻译,不要安装modelscope、tensorflow包!!!**
|
||||||
|
- 这个插件生成质量可能比官方差很多。
|
||||||
|
- 仅测试 **cuda+fp16/fp32** ,其他搭配自行测试。
|
||||||
|
- 仅在**ComfyUI官方整合包稳定版**+**绿色便携(python_embed)**+**windows测试**,第三方整合包、虚拟环境和其他操作系统(例如linux)自行测试,无法保证正常使用。
|
||||||
|
- Tensorflow需要特定版本cuda才能跑到gpu上,但是在原生windows上 [tensorflow 2.10+:详情看note](https://github.com/tensorflow/tensorflow/releases/tag/v2.11.1) 无法调用gpu,必须使用linux或者wsl2才行。这种情况下`damo/nlp_csanmt_translation_zh2en`翻译只能跑在cpu上,速度很慢。
|
||||||
|
- 如果出现`Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same`错误,打开**all_to_device**,也许有效。感谢 **@[602387193c](https://github.com/602387193c)** -----> **[issues/17](https://github.com/zmwv823/ComfyUI-AnyText/issues/17)**
|
||||||
|
- 如果出现`expected scalar type Half but found Float`,尝试fp32。
|
||||||
|
### v2测试,更加本地化,非远程代码模式。
|
||||||
|
|
||||||
|
## 使用说明:
|
||||||
|
- `utrobinmv/t5_translate_en_ru_zh_small_1024`(212MB)翻译速度快、体积小,但是准确度远不如`damo/nlp_csanmt_translation_zh2en`(7.3GB)。
|
||||||
|
- 自动检测提示词是否中文,来决定是否自动加载翻译。
|
||||||
|
- 手绘遮罩数量必须>=你想生成文字的数量,每一个“”代表一个文字数量,“”里面内容不限长度,否则会报错 ["not enough values to unpack"](https://github.com/zmwv823/ComfyUI-AnyText/issues/7).
|
||||||
|
- 个人电脑环境: ComfyUI官方整合包+(ComfyUI_windows_portable\ComfyUI下)脚本运行+python_embed+win10+py311+torch2.3.0+cu121+rtx3050laptop(4GB).
|
||||||
|
- pillow>=9.5.0(10.3.0)大部分包都是最新版本。
|
||||||
|
- **支持任意分辨率图片输入,但是会被缩放到<=768,输出图片也会被限制到<=768(官方策略)。**
|
||||||
|
- **如果font、ckpt_name、clip设置为Auto_DownLoad,则会自动下载默认模型到特定目录。如果模型已经存在,则会自动加载。**
|
||||||
|
- 自动从笑脸(huggingface)下载的AnyText模型(fp16: 2.66 GB)在"ComfyUI\models\checkpoints\15\anytext_v1.1.safetensors"。
|
||||||
|
- 你可以手动从[魔搭(modelscope)-AnyText-FP32-5.73 GB](https://modelscope.cn/models/iic/cv_anytext_text_generation_editing/file/view/master?fileName=anytext_v1.1.ckpt&status=2)下载,然后放到**ComfyUI\models\checkpoints**。
|
||||||
|
- 或者手动从[笑脸(huggingface)-AnyText-FP16-2.66 GB](https://huggingface.co/Sanster/AnyText/blob/main/pytorch_model.fp16.safetensors) 下载并重命名为**anytext_v1.1.safetensors或者任意名字**。然后放到 **ComfyUI\models\checkpoints**。
|
||||||
|
- [clip模型-**clip-vit-large-patch14**](https://huggingface.co/openai/clip-vit-large-patch14)会下载到 `C:\Users\username\.cache\huggingface\hub`。可以手动下载[clip模型](https://huggingface.co/openai/clip-vit-large-patch14)放到**ComfyUI\models\clip\openai--clip-vit-large-patch14**位置。
|
||||||
|
- ![](./clip_model.jpg)
|
||||||
|
- [字体-(SourceHanSansSC-Medium.otf)-18MB](https://huggingface.co/Sanster/AnyText/blob/main/SourceHanSansSC-Medium.otf)会从笑脸(huggingface)下载到**ComfyUI\models\fonts**位置,你也可以使用自己的字体。
|
||||||
|
- 翻译模型会自动从[笑脸huggingface--utrobinmv/t5_translate_en_ru_zh_small_1024](https://huggingface.co/utrobinmv/t5_translate_en_ru_zh_small_1024--212MB)下载到`C:\Users\username\.cache\huggingface\hub`或者 [魔搭modelscope--damo\nlp_csanmt_translation_zh2en--7.3GB](https://www.modelscope.cn/models/iic/nlp_csanmt_translation_zh2en)下载到`C:\Users\username\.cache\modelscope\hub\damo`位置。可以手动从前面链接下载,然后把所有文件放到`ComfyUI\models\prompt_generator\models--utrobinmv--t5_translate_en_ru_zh_small_1024`或者`ComfyUI\models\prompt_generator\nlp_csanmt_translation_zh2en`
|
||||||
|
- ![](./zh2en_model.jpg)
|
||||||
|
- **AnyText模型本身是一个标准的sd1.5文生图模型。**
|
||||||
|
|
||||||
|
## 示例提示词:
|
||||||
|
### 文本生成英文提示词:
|
||||||
|
- An exquisite mug with an ancient Chinese poem engraved on it, including “花落知多少” and “夜来风雨声” and “处处闻啼鸟” and “春眠不觉晓”
|
||||||
|
- Sign on the clean building that reads “科学” and "과학" and "ステップ" and "SCIENCE"
|
||||||
|
- An ice sculpture is made with the text "Happy" and "Holidays".Dslr photo.
|
||||||
|
- A baseball cap with words “要聪明地” and “全力以赴”
|
||||||
|
- A nice drawing of octopus, sharks, and boats made by a child with crayons, with the words “神奇海底世界”
|
||||||
|
### 文本编辑英文提示词
|
||||||
|
- A Minion meme that says "wrong"
|
||||||
|
- A pile of fruit with "UIT" written in the middle
|
||||||
|
- photo of clean sandy beach," " " "
|
||||||
|
### 文本生成中文提示词:
|
||||||
|
- 一个儿童蜡笔画,森林里有一个可爱的蘑菇形状的房子,标题是"森林小屋"
|
||||||
|
- 一个精美设计的logo,画的是一个黑白风格的厨师,带着厨师帽,logo下方写着“深夜食堂”
|
||||||
|
- 一张户外雪地靴的电商广告,上面写着 “双12大促!”,“立减50”,“加绒加厚”,“穿脱方便”,“温暖24小时送达”, “包邮”,高级设计感,精美构图
|
||||||
|
- 一个精致的马克杯,上面雕刻着一首中国古诗,内容是 "花落知多少" "夜来风雨声" "处处闻啼鸟" "春眠不觉晓"
|
||||||
|
- 一个漂亮的蜡笔画,有行星,宇航员,还有宇宙飞船,上面写的是"去火星旅行", "王小明", "11月1日"
|
||||||
|
- 一个装饰华丽的蛋糕,上面用奶油写着“阿里云”和"APSARA"
|
||||||
|
- 一张关于墙上的彩色涂鸦艺术的摄影作品,上面写着“人工智能" 和 "神经网络"
|
||||||
|
- 一枚中国古代铜钱, 上面的文字是 "康" "寶" "通" "熙"
|
||||||
|
- 精美的书法作品,上面写着“志” “存” “高” “远”
|
||||||
|
### 文本编辑中文提示词:
|
||||||
|
- 一个表情包,小猪说 "下班"
|
||||||
|
- 一个中国古代铜钱,上面写着"乾" "隆"
|
||||||
|
- 一个黄色标志牌,上边写着"不要" 和 "大意"
|
||||||
|
- 一个建筑物前面的字母标牌, 上面写着 " "
|
||||||
|
## 示例工作流:
|
||||||
|
|
||||||
|
![workflow](./AnyText-wf.png)
|
||||||
|
|
||||||
|
## 部分参数:
|
||||||
|
|
||||||
|
### sort_radio: 位置排序,位置排序时的优先级。
|
||||||
|
|
||||||
|
- ↕代表Y轴,这个选项会按照遮罩(mask)位置从上到下生成,提示词里面的从开始到结束顺序的字符串(""内的内容)。
|
||||||
|
- ↔代表X轴,这个选项会按照遮罩(mask)位置从左到右生成,提示词里面的从开始到结束顺序的字符串(""内的内容)。
|
||||||
|
|
||||||
|
### revise_pose: 修正位置(仅text-generation模式生效)。
|
||||||
|
|
||||||
|
- 尝试通过渲染后的文字行的外接矩形框修正位置,但是这个选项对生成的图片创造性有一定影响。
|
||||||
|
|
||||||
|
### Random_Gen: 自动生成随机位置遮罩。
|
||||||
|
|
||||||
|
- 根据提示词内字符串数量自动生成遮罩,启用这个选项时手动绘制的遮罩图不生效。
|
||||||
|
|
||||||
|
### nonEdit_random_gen_width & nonEdit_random_gen_height:
|
||||||
|
|
||||||
|
- 当**text-generation和Random_Gen**一起使用时控制图片尺寸,仅此时生效。
|
||||||
|
|
||||||
|
### cpu_offload:
|
||||||
|
|
||||||
|
- 如果是多轮生成,能大幅提速。但是需要在最后不再需要这个节点时且还有后续其他流程,最后关掉这个选项跑一次,来释放转移到cpu上的模型。如果仅生成一次,不要开这个选项。
|
||||||
|
|
||||||
|
## 鸣谢:
|
||||||
|
### [Fork Repo: MaletteAI/anytext](https://github.com/MaletteAI/anytext)
|
||||||
|
- V2构建本地管线思路的来源。
|
||||||
|
### [Official Repo: tyxsspa/AnyText](https://github.com/tyxsspa/AnyText)
|
||||||
|
|
||||||
|
```
|
||||||
|
@article{tuo2023anytext,
|
||||||
|
title={AnyText: Multilingual Visual Text Generation And Editing},
|
||||||
|
author={Yuxiang Tuo and Wangmeng Xiang and Jun-Yan He and Yifeng Geng and Xuansong Xie},
|
||||||
|
year={2023},
|
||||||
|
eprint={2311.03054},
|
||||||
|
archivePrefix={arXiv},
|
||||||
|
primaryClass={cs.CV}
|
||||||
|
}
|
||||||
|
```
|
BIN
AnyText/assets/clip_model.jpg
Normal file
After Width: | Height: | Size: 22 KiB |
BIN
AnyText/assets/zh2en_model.jpg
Normal file
After Width: | Height: | Size: 28 KiB |
BIN
AnyText/example_images/edit12.png
Normal file
After Width: | Height: | Size: 392 KiB |
BIN
AnyText/example_images/edit13.png
Normal file
After Width: | Height: | Size: 480 KiB |
BIN
AnyText/example_images/edit15.png
Normal file
After Width: | Height: | Size: 353 KiB |
BIN
AnyText/example_images/edit16.png
Normal file
After Width: | Height: | Size: 377 KiB |
BIN
AnyText/example_images/edit2.png
Normal file
After Width: | Height: | Size: 710 KiB |
BIN
AnyText/example_images/edit3.png
Normal file
After Width: | Height: | Size: 493 KiB |
BIN
AnyText/example_images/edit5.png
Normal file
After Width: | Height: | Size: 372 KiB |
BIN
AnyText/example_images/ref12.png
Normal file
After Width: | Height: | Size: 444 KiB |
BIN
AnyText/example_images/ref13.jpg
Normal file
After Width: | Height: | Size: 151 KiB |
BIN
AnyText/example_images/ref15.jpeg
Normal file
After Width: | Height: | Size: 185 KiB |
BIN
AnyText/example_images/ref16.jpeg
Normal file
After Width: | Height: | Size: 68 KiB |
BIN
AnyText/example_images/ref2.jpg
Normal file
After Width: | Height: | Size: 150 KiB |
BIN
AnyText/example_images/ref3.jpg
Normal file
After Width: | Height: | Size: 109 KiB |
BIN
AnyText/example_images/ref5.jpg
Normal file
After Width: | Height: | Size: 86 KiB |
99
AnyText/models_yaml/anytext_sd15.yaml
Normal file
@ -0,0 +1,99 @@
|
|||||||
|
model:
|
||||||
|
target: custom_nodes.ComfyUI-AnyText.AnyText.AnyText_scripts.cldm.cldm.ControlLDM
|
||||||
|
params:
|
||||||
|
linear_start: 0.00085
|
||||||
|
linear_end: 0.0120
|
||||||
|
num_timesteps_cond: 1
|
||||||
|
log_every_t: 200
|
||||||
|
timesteps: 1000
|
||||||
|
first_stage_key: "img"
|
||||||
|
cond_stage_key: "caption"
|
||||||
|
control_key: "hint"
|
||||||
|
glyph_key: "glyphs"
|
||||||
|
position_key: "positions"
|
||||||
|
image_size: 64
|
||||||
|
channels: 4
|
||||||
|
cond_stage_trainable: true # need be true when embedding_manager is valid
|
||||||
|
conditioning_key: crossattn
|
||||||
|
monitor: val/loss_simple_ema
|
||||||
|
scale_factor: 0.18215
|
||||||
|
use_ema: False
|
||||||
|
only_mid_control: False
|
||||||
|
loss_alpha: 0 # perceptual loss, 0.003
|
||||||
|
loss_beta: 0 # ctc loss
|
||||||
|
latin_weight: 1.0 # latin text line may need smaller weigth
|
||||||
|
with_step_weight: true
|
||||||
|
use_vae_upsample: true
|
||||||
|
embedding_manager_config:
|
||||||
|
target: custom_nodes.ComfyUI-AnyText.AnyText.AnyText_scripts.cldm.embedding_manager.EmbeddingManager
|
||||||
|
params:
|
||||||
|
valid: true # v6
|
||||||
|
emb_type: ocr # ocr, vit, conv
|
||||||
|
glyph_channels: 1
|
||||||
|
position_channels: 1
|
||||||
|
add_pos: false
|
||||||
|
placeholder_string: '*'
|
||||||
|
|
||||||
|
control_stage_config:
|
||||||
|
target: custom_nodes.ComfyUI-AnyText.AnyText.AnyText_scripts.cldm.cldm.ControlNet
|
||||||
|
params:
|
||||||
|
image_size: 32 # unused
|
||||||
|
in_channels: 4
|
||||||
|
model_channels: 320
|
||||||
|
glyph_channels: 1
|
||||||
|
position_channels: 1
|
||||||
|
attention_resolutions: [ 4, 2, 1 ]
|
||||||
|
num_res_blocks: 2
|
||||||
|
channel_mult: [ 1, 2, 4, 4 ]
|
||||||
|
num_heads: 8
|
||||||
|
use_spatial_transformer: True
|
||||||
|
transformer_depth: 1
|
||||||
|
context_dim: 768
|
||||||
|
use_checkpoint: True
|
||||||
|
legacy: False
|
||||||
|
|
||||||
|
unet_config:
|
||||||
|
target: custom_nodes.ComfyUI-AnyText.AnyText.AnyText_scripts.cldm.cldm.ControlledUnetModel
|
||||||
|
params:
|
||||||
|
image_size: 32 # unused
|
||||||
|
in_channels: 4
|
||||||
|
out_channels: 4
|
||||||
|
model_channels: 320
|
||||||
|
attention_resolutions: [ 4, 2, 1 ]
|
||||||
|
num_res_blocks: 2
|
||||||
|
channel_mult: [ 1, 2, 4, 4 ]
|
||||||
|
num_heads: 8
|
||||||
|
use_spatial_transformer: True
|
||||||
|
transformer_depth: 1
|
||||||
|
context_dim: 768
|
||||||
|
use_checkpoint: True
|
||||||
|
legacy: False
|
||||||
|
|
||||||
|
first_stage_config:
|
||||||
|
target: custom_nodes.ComfyUI-AnyText.AnyText.AnyText_scripts.ldm.models.autoencoder.AutoencoderKL
|
||||||
|
params:
|
||||||
|
embed_dim: 4
|
||||||
|
monitor: val/rec_loss
|
||||||
|
ddconfig:
|
||||||
|
double_z: true
|
||||||
|
z_channels: 4
|
||||||
|
resolution: 256
|
||||||
|
in_channels: 3
|
||||||
|
out_ch: 3
|
||||||
|
ch: 128
|
||||||
|
ch_mult:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 4
|
||||||
|
num_res_blocks: 2
|
||||||
|
attn_resolutions: []
|
||||||
|
dropout: 0.0
|
||||||
|
lossconfig:
|
||||||
|
target: torch.nn.Identity
|
||||||
|
|
||||||
|
cond_stage_config:
|
||||||
|
target: custom_nodes.ComfyUI-AnyText.AnyText.AnyText_scripts.ldm.modules.encoders.modules.FrozenCLIPEmbedderT3
|
||||||
|
params:
|
||||||
|
# version: /home/yuxiang.tyx/.cache/modelscope/hub/damo/cv_anytext_text_generation_editing/clip-vit-large-patch14
|
||||||
|
use_vision: false # v6
|
280
AnyText/nodes.py
Normal file
@ -0,0 +1,280 @@
|
|||||||
|
import os
|
||||||
|
import folder_paths
|
||||||
|
import re
|
||||||
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
from .utils import is_module_imported, pil2tensor, get_device_by_name, comfy_tensor_Image2np_Image
|
||||||
|
|
||||||
|
comfy_temp_dir = folder_paths.get_temp_directory()
|
||||||
|
Random_Gen_Mask_path = os.path.join(comfy_temp_dir, "AnyText_random_mask_pos_img.png")
|
||||||
|
tmp_pose_img_path = os.path.join(comfy_temp_dir, "AnyText_manual_mask_pos_img.png")
|
||||||
|
tmp_ori_img_path = os.path.join(comfy_temp_dir, "AnyText_ori_img.png")
|
||||||
|
|
||||||
|
class AnyText:
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.model = None
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def INPUT_TYPES(cls):
|
||||||
|
return {
|
||||||
|
"required": {
|
||||||
|
"AnyText_Loader": ("AnyText_Loader", {"forceInput": True}),
|
||||||
|
"prompt": ("STRING", {"default": "A raccoon stands in front of the blackboard with the words \"你好呀~Hello!\" written on it.", "multiline": True}),
|
||||||
|
"a_prompt": ("STRING", {"default": "best quality, extremely detailed,4k, HD, supper legible text, clear text edges, clear strokes, neat writing, no watermarks", "multiline": True}),
|
||||||
|
"n_prompt": ("STRING", {"default": "low-res, bad anatomy, extra digit, fewer digits, cropped, worst quality, low quality, watermark, unreadable text, messy words, distorted text, disorganized writing, advertising picture", "multiline": True}),
|
||||||
|
"mode": (['text-generation', 'text-editing'],{"default": 'text-generation'}),
|
||||||
|
"sort_radio": (["↕", "↔"],{"default": "↔"}),
|
||||||
|
"revise_pos": ("BOOLEAN", {"default": False}),
|
||||||
|
"img_count": ("INT", {"default": 1, "min": 1, "max": 10}),
|
||||||
|
"ddim_steps": ("INT", {"default": 20, "min": 2, "max": 100}),
|
||||||
|
"seed": ("INT", {"default": 9999, "min": -1, "max": 99999999}),
|
||||||
|
"nonEdit_random_gen_width": ("INT", {"default": 512, "min": 128, "max": 1920, "step": 64}),
|
||||||
|
"nonEdit_random_gen_height": ("INT", {"default": 512, "min": 128, "max": 1920, "step": 64}),
|
||||||
|
# "width": ("INT", {"forceInput": True}),
|
||||||
|
# "height": ("INT", {"forceInput": True}),
|
||||||
|
"Random_Gen": ("BOOLEAN", {"default": False}),
|
||||||
|
"strength": ("FLOAT", {
|
||||||
|
"default": 1.00,
|
||||||
|
"min": -999999,
|
||||||
|
"max": 9999999,
|
||||||
|
"step": 0.01
|
||||||
|
}),
|
||||||
|
"cfg_scale": ("FLOAT", {
|
||||||
|
"default": 9,
|
||||||
|
"min": 1,
|
||||||
|
"max": 99,
|
||||||
|
"step": 0.1
|
||||||
|
}),
|
||||||
|
"eta": ("FLOAT", {
|
||||||
|
"default": 0,
|
||||||
|
"min": 0,
|
||||||
|
"max": 1,
|
||||||
|
"step": 0.1
|
||||||
|
}),
|
||||||
|
"device": (["auto", "cuda", "cpu", "mps", "xpu"],{"default": "auto"}),
|
||||||
|
"fp16": ("BOOLEAN", {"default": True}),
|
||||||
|
"cpu_offload": ("BOOLEAN", {"default": False, "label_on": "model_to_cpu", "label_off": "unload_model"}),
|
||||||
|
"all_to_device": ("BOOLEAN", {"default": False}),
|
||||||
|
},
|
||||||
|
"optional": {
|
||||||
|
"ori_image": ("IMAGE", {"forceInput": True}),
|
||||||
|
"pos_image": ("IMAGE", {"forceInput": True}),
|
||||||
|
# "show_debug": ("BOOLEAN", {"default": False}),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
RETURN_TYPES = ("IMAGE",)
|
||||||
|
CATEGORY = "ExtraModels/AnyText"
|
||||||
|
FUNCTION = "anytext_process"
|
||||||
|
TITLE = "AnyText Geneation"
|
||||||
|
|
||||||
|
def anytext_process(self,
|
||||||
|
mode,
|
||||||
|
AnyText_Loader,
|
||||||
|
ori_image,
|
||||||
|
pos_image,
|
||||||
|
sort_radio,
|
||||||
|
revise_pos,
|
||||||
|
Random_Gen,
|
||||||
|
prompt,
|
||||||
|
cpu_offload,
|
||||||
|
# show_debug,
|
||||||
|
img_count,
|
||||||
|
fp16,
|
||||||
|
device,
|
||||||
|
all_to_device,
|
||||||
|
ddim_steps=20,
|
||||||
|
strength=1,
|
||||||
|
cfg_scale=9,
|
||||||
|
seed="",
|
||||||
|
eta=0.0,
|
||||||
|
a_prompt="",
|
||||||
|
n_prompt="",
|
||||||
|
nonEdit_random_gen_width=512,
|
||||||
|
nonEdit_random_gen_height=512,
|
||||||
|
):
|
||||||
|
def prompt_replace(prompt):
|
||||||
|
prompt = prompt.replace('“', '"')
|
||||||
|
prompt = prompt.replace('”', '"')
|
||||||
|
p = '"(.*?)"'
|
||||||
|
strs = re.findall(p, prompt)
|
||||||
|
if len(strs) == 0:
|
||||||
|
strs = [' ']
|
||||||
|
else:
|
||||||
|
for s in strs:
|
||||||
|
prompt = prompt.replace(f'"{s}"', f' * ', 1)
|
||||||
|
return prompt
|
||||||
|
|
||||||
|
def check_overlap_polygon(rect_pts1, rect_pts2):
|
||||||
|
poly1 = cv2.convexHull(rect_pts1)
|
||||||
|
poly2 = cv2.convexHull(rect_pts2)
|
||||||
|
rect1 = cv2.boundingRect(poly1)
|
||||||
|
rect2 = cv2.boundingRect(poly2)
|
||||||
|
if rect1[0] + rect1[2] >= rect2[0] and rect2[0] + rect2[2] >= rect1[0] and rect1[1] + rect1[3] >= rect2[1] and rect2[1] + rect2[3] >= rect1[1]:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
def count_lines(prompt):
|
||||||
|
prompt = prompt.replace('“', '"')
|
||||||
|
prompt = prompt.replace('”', '"')
|
||||||
|
p = '"(.*?)"'
|
||||||
|
strs = re.findall(p, prompt)
|
||||||
|
if len(strs) == 0:
|
||||||
|
strs = [' ']
|
||||||
|
return len(strs)
|
||||||
|
|
||||||
|
def generate_rectangles(w, h, n, max_trys=200):
|
||||||
|
img = np.zeros((h, w, 1), dtype=np.uint8)
|
||||||
|
rectangles = []
|
||||||
|
attempts = 0
|
||||||
|
n_pass = 0
|
||||||
|
low_edge = int(max(w, h)*0.3 if n <= 3 else max(w, h)*0.2) # ~150, ~100
|
||||||
|
while attempts < max_trys:
|
||||||
|
rect_w = min(np.random.randint(max((w*0.5)//n, low_edge), w), int(w*0.8))
|
||||||
|
ratio = np.random.uniform(4, 10)
|
||||||
|
rect_h = max(low_edge, int(rect_w/ratio))
|
||||||
|
rect_h = min(rect_h, int(h*0.8))
|
||||||
|
# gen rotate angle
|
||||||
|
rotation_angle = 0
|
||||||
|
rand_value = np.random.rand()
|
||||||
|
if rand_value < 0.7:
|
||||||
|
pass
|
||||||
|
elif rand_value < 0.8:
|
||||||
|
rotation_angle = np.random.randint(0, 40)
|
||||||
|
elif rand_value < 0.9:
|
||||||
|
rotation_angle = np.random.randint(140, 180)
|
||||||
|
else:
|
||||||
|
rotation_angle = np.random.randint(85, 95)
|
||||||
|
# rand position
|
||||||
|
x = np.random.randint(0, w - rect_w)
|
||||||
|
y = np.random.randint(0, h - rect_h)
|
||||||
|
# get vertex
|
||||||
|
rect_pts = cv2.boxPoints(((rect_w/2, rect_h/2), (rect_w, rect_h), rotation_angle))
|
||||||
|
rect_pts = np.int32(rect_pts)
|
||||||
|
# move
|
||||||
|
rect_pts += (x, y)
|
||||||
|
# check boarder
|
||||||
|
if np.any(rect_pts < 0) or np.any(rect_pts[:, 0] >= w) or np.any(rect_pts[:, 1] >= h):
|
||||||
|
attempts += 1
|
||||||
|
continue
|
||||||
|
# check overlap
|
||||||
|
if any(check_overlap_polygon(rect_pts, rp) for rp in rectangles): # type: ignore
|
||||||
|
attempts += 1
|
||||||
|
continue
|
||||||
|
n_pass += 1
|
||||||
|
img = cv2.fillPoly(img, [rect_pts], 255)
|
||||||
|
cv2.imwrite(Random_Gen_Mask_path, 255-img[..., ::-1])
|
||||||
|
rectangles.append(rect_pts)
|
||||||
|
if n_pass == n:
|
||||||
|
break
|
||||||
|
print("attempts:", attempts)
|
||||||
|
if len(rectangles) != n:
|
||||||
|
raise Exception(f'Failed in auto generate positions after {attempts} attempts, try again!')
|
||||||
|
return img
|
||||||
|
|
||||||
|
if not is_module_imported('AnyText_Pipeline'):
|
||||||
|
from .AnyText_scripts.AnyText_pipeline import AnyText_Pipeline
|
||||||
|
|
||||||
|
#check if prompt is chinese to decide whether to load translator,检测是否为中文提示词,否则不适用翻译。
|
||||||
|
prompt_modify = prompt_replace(prompt)
|
||||||
|
bool_is_chinese = AnyText_Pipeline.is_chinese(self, prompt_modify)
|
||||||
|
|
||||||
|
device = get_device_by_name(device)
|
||||||
|
loader_out = AnyText_Loader.split("|")
|
||||||
|
|
||||||
|
if bool_is_chinese == False:
|
||||||
|
use_translator = False
|
||||||
|
else:
|
||||||
|
use_translator = True
|
||||||
|
if 'damo/nlp_csanmt_translation_zh2en' in loader_out[3]:
|
||||||
|
if not os.access(os.path.join(folder_paths.models_dir, "prompt_generator", "nlp_csanmt_translation_zh2en", "tf_ckpts", "ckpt-0.data-00000-of-00001"), os.F_OK):
|
||||||
|
if not is_module_imported('snapshot_download'):
|
||||||
|
from modelscope.hub.snapshot_download import snapshot_download
|
||||||
|
snapshot_download('damo/nlp_csanmt_translation_zh2en')
|
||||||
|
else:
|
||||||
|
if not os.access(os.path.join(folder_paths.models_dir, "prompt_generator", "models--utrobinmv--t5_translate_en_ru_zh_small_1024", "model.safetensors"), os.F_OK):
|
||||||
|
if not is_module_imported('hg_snapshot_download'):
|
||||||
|
from huggingface_hub import snapshot_download as hg_snapshot_download
|
||||||
|
hg_snapshot_download(repo_id="utrobinmv/t5_translate_en_ru_zh_small_1024")
|
||||||
|
|
||||||
|
pipe = AnyText_Pipeline(ckpt_path=loader_out[1], clip_path=loader_out[2], translator_path=loader_out[3], cfg_path=loader_out[4], use_translator=use_translator, device=device, use_fp16=fp16, all_to_device=all_to_device, loaded_model_tensor=self.model)
|
||||||
|
|
||||||
|
# tensor图片转换为numpy图片
|
||||||
|
pos_image = comfy_tensor_Image2np_Image(self, pos_image)
|
||||||
|
ori_image = comfy_tensor_Image2np_Image(self, ori_image)
|
||||||
|
# 保存转换后的numpy图片到ComfyUI临时文件夹
|
||||||
|
pos_image.save(tmp_pose_img_path)
|
||||||
|
ori_image.save(tmp_ori_img_path)
|
||||||
|
|
||||||
|
ori = tmp_ori_img_path
|
||||||
|
pos = tmp_pose_img_path
|
||||||
|
|
||||||
|
if mode == "text-generation":
|
||||||
|
ori_image = None
|
||||||
|
revise_pos = revise_pos
|
||||||
|
else:
|
||||||
|
revise_pos = False
|
||||||
|
ori_image = ori
|
||||||
|
|
||||||
|
n_lines = count_lines(prompt)
|
||||||
|
if Random_Gen == True:
|
||||||
|
generate_rectangles(nonEdit_random_gen_width, nonEdit_random_gen_height, n_lines, max_trys=500)
|
||||||
|
pos_img = Random_Gen_Mask_path
|
||||||
|
else:
|
||||||
|
pos_img = pos
|
||||||
|
|
||||||
|
# lora_path = r"D:\AI\ComfyUI_windows_portable\ComfyUI\models\loras\ys艺术\sd15_mw_bpch_扁平风格插画v1d1.safetensors"
|
||||||
|
# lora_ratio = 1
|
||||||
|
# lora_path_ratio = str(lora_path)+ " " + str(lora_ratio)
|
||||||
|
# print("\033[93m", lora_path_ratio, "\033[0m")
|
||||||
|
|
||||||
|
params = {
|
||||||
|
"mode": mode,
|
||||||
|
"use_fp16": fp16,
|
||||||
|
"Random_Gen": Random_Gen,
|
||||||
|
"sort_priority": sort_radio,
|
||||||
|
"revise_pos": revise_pos,
|
||||||
|
# "show_debug": show_debug,
|
||||||
|
"image_count": img_count,
|
||||||
|
"ddim_steps": ddim_steps - 1,
|
||||||
|
"image_width": nonEdit_random_gen_width,
|
||||||
|
"image_height": nonEdit_random_gen_height,
|
||||||
|
"strength": strength,
|
||||||
|
"cfg_scale": cfg_scale,
|
||||||
|
"eta": eta,
|
||||||
|
"a_prompt": a_prompt,
|
||||||
|
"n_prompt": n_prompt,
|
||||||
|
# "lora_path_ratio": lora_path_ratio,
|
||||||
|
}
|
||||||
|
input_data = {
|
||||||
|
"prompt": prompt,
|
||||||
|
"seed": seed,
|
||||||
|
"draw_pos": pos_img,
|
||||||
|
"ori_image": ori_image,
|
||||||
|
}
|
||||||
|
# if show_debug ==True:
|
||||||
|
# print(f'\033[93mloader from .util(从.util输入的loader): {AnyText_Loader}, \033[0m\n \
|
||||||
|
# \033[93mloader_out split form loader(分割loader得到4个参数): {loader_out}, \033[0m\n \
|
||||||
|
# \033[93mFont(字体)--loader_out[0]: {loader_out[0]}, \033[0m\n \
|
||||||
|
# \033[93mAnyText Model(AnyText模型)--loader_out[1]: {loader_out[1]}, \033[0m\n \
|
||||||
|
# \033[93mclip model(clip模型)--loader_out[2]: {loader_out[2]}, \033[0m\n \
|
||||||
|
# \033[93mTranslator(翻译模型)--loader_out[3]: {loader_out[3]}, \033[0m\n \
|
||||||
|
# \033[93myaml_file(yaml配置文件): {loader_out[4]}, \033[0m\n) \
|
||||||
|
# \033[93mIs Chinese Input(是否中文输入): {use_translator}, \033[0m\n \
|
||||||
|
# \033[93mNumber of text-content to generate(需要生成的文本数量): {n_lines}, \033[0m\n \
|
||||||
|
# \033[93mpos_image location(遮罩图位置): {pos}, \033[0m\n \
|
||||||
|
# \033[93mori_image location(原图位置): {ori}, \033[0m\n \
|
||||||
|
# \033[93mSort Position(文本生成位置排序): {sort_radio}, \033[0m\n \
|
||||||
|
# \033[93mEnable revise_pos(启用位置修正): {revise_pos}, \033[0m')
|
||||||
|
x_samples, results, rtn_code, rtn_warning, debug_info, self.model = pipe(input_data, font_path=loader_out[0], cpu_offload=cpu_offload, **params)
|
||||||
|
if rtn_code < 0:
|
||||||
|
raise Exception(f"Error in AnyText pipeline: {rtn_warning}")
|
||||||
|
output = pil2tensor(x_samples)
|
||||||
|
print("\n", debug_info)
|
||||||
|
return(output)
|
||||||
|
|
||||||
|
# Node class and display name mappings
|
||||||
|
NODE_CLASS_MAPPINGS = {
|
||||||
|
"AnyText": AnyText,
|
||||||
|
}
|
6623
AnyText/ocr_weights/ppocr_keys_v1.txt
Normal file
BIN
AnyText/ocr_weights/ppv3_rec.pth
Normal file
214
AnyText/utils.py
Normal file
@ -0,0 +1,214 @@
|
|||||||
|
import os
|
||||||
|
import folder_paths
|
||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
import time
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
current_directory = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
comfyui_models_dir = folder_paths.models_dir
|
||||||
|
comfy_temp_dir = folder_paths.get_temp_directory()
|
||||||
|
temp_txt_path = os.path.join(comfy_temp_dir, "AnyText_temp.txt")
|
||||||
|
|
||||||
|
class AnyText_loader:
|
||||||
|
@classmethod
|
||||||
|
def INPUT_TYPES(cls):
|
||||||
|
font_list = os.listdir(os.path.join(comfyui_models_dir, "fonts"))
|
||||||
|
checkpoints_list = folder_paths.get_filename_list("checkpoints")
|
||||||
|
clip_list = os.listdir(os.path.join(comfyui_models_dir, "clip"))
|
||||||
|
font_list.insert(0, "Auto_DownLoad")
|
||||||
|
checkpoints_list.insert(0, "Auto_DownLoad")
|
||||||
|
clip_list.insert(0, "Auto_DownLoad")
|
||||||
|
|
||||||
|
return {
|
||||||
|
"required": {
|
||||||
|
"font": (font_list, ),
|
||||||
|
"ckpt_name": (checkpoints_list, ),
|
||||||
|
"clip": (clip_list, ),
|
||||||
|
"translator": (["utrobinmv/t5_translate_en_ru_zh_small_1024", "damo/nlp_csanmt_translation_zh2en"],{"default": "utrobinmv/t5_translate_en_ru_zh_small_1024"}),
|
||||||
|
# "show_debug": ("BOOLEAN", {"default": False}),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
RETURN_TYPES = ("AnyText_Loader", )
|
||||||
|
RETURN_NAMES = ("AnyText_Loader", )
|
||||||
|
FUNCTION = "AnyText_loader_fn"
|
||||||
|
CATEGORY = "ExtraModels/AnyText"
|
||||||
|
TITLE = "AnyText Loader"
|
||||||
|
|
||||||
|
def AnyText_loader_fn(self,
|
||||||
|
font,
|
||||||
|
ckpt_name,
|
||||||
|
clip,
|
||||||
|
translator,
|
||||||
|
# show_debug
|
||||||
|
):
|
||||||
|
font_path = os.path.join(comfyui_models_dir, "fonts", font)
|
||||||
|
ckpt_path = folder_paths.get_full_path("checkpoints", ckpt_name)
|
||||||
|
cfg_path = os.path.join(current_directory, 'models_yaml', 'anytext_sd15.yaml')
|
||||||
|
if clip != 'Auto_DownLoad':
|
||||||
|
clip_path = os.path.join(comfyui_models_dir, "clip", clip)
|
||||||
|
else:
|
||||||
|
clip_path = clip
|
||||||
|
if translator != 'Auto_DownLoad':
|
||||||
|
translator_path = os.path.join(comfyui_models_dir, "prompt_generator", translator)
|
||||||
|
else:
|
||||||
|
translator_path = translator
|
||||||
|
|
||||||
|
#将输入参数合并到一个参数里面传递到.nodes
|
||||||
|
loader = (font_path + "|" + str(ckpt_path) + "|" + clip_path + "|" + translator_path + "|" + cfg_path)
|
||||||
|
|
||||||
|
# if show_debug == True:
|
||||||
|
# print(f'\033[93mloader(合并后的4个输入参数,传递给nodes): {loader} \033[0m\n \
|
||||||
|
# \033[93mfont_path(字体): {font_path} \033[0m\n \
|
||||||
|
# \033[93mckpt_path(AnyText模型): {ckpt_path} \033[0m\n \
|
||||||
|
# \033[93mclip_path(clip模型): {clip_path} \033[0m\n \
|
||||||
|
# \033[93mtranslator_path(翻译模型): {translator_path} \033[0m\n \
|
||||||
|
# \033[93myaml_file(yaml配置文件): {cfg_path} \033[0m\n')
|
||||||
|
return (loader, )
|
||||||
|
|
||||||
|
class AnyText_translator:
|
||||||
|
@classmethod
|
||||||
|
def INPUT_TYPES(cls):
|
||||||
|
return {
|
||||||
|
"required": {
|
||||||
|
"model": (["utrobinmv/t5_translate_en_ru_zh_small_1024", "damo/nlp_csanmt_translation_zh2en"],{"default": "utrobinmv/t5_translate_en_ru_zh_small_1024"}),
|
||||||
|
"prompt": ("STRING", {"default": "这里是单批次翻译文本输入。\n声明补充说,沃伦的同事都深感震惊,并且希望他能够投案自首。\n尽量输入单句文本,如果是多句长文本建议人工分句,否则可能出现漏译或未译等情况!!!\n使用换行,效果可能更佳。", "multiline": True}),
|
||||||
|
"Batch_prompt": ("STRING", {"default": "这里是多批次翻译文本输入,使用换行进行分割。\n天上掉馅饼啦,快去看超人!!!\n飞流直下三千尺,疑似银河落九天。\n启用Batch_Newline表示输出的翻译会按换行输入进行二次换行,否则是用空格合并起来的整篇文本。", "multiline": True}),
|
||||||
|
"t5_Target_Language": (["en", "zh", "ru", ],{"default": "en"}),
|
||||||
|
"if_Batch": ("BOOLEAN", {"default": False}),
|
||||||
|
"Batch_Newline" :("BOOLEAN", {"default": True}),
|
||||||
|
"device": (["auto", "cuda", "cpu", "mps", "xpu"],{"default": "auto"}),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
RETURN_TYPES = ("STRING",)
|
||||||
|
RETURN_NAMES = ("text",)
|
||||||
|
CATEGORY = "ExtraModels/AnyText"
|
||||||
|
FUNCTION = "AnyText_translator"
|
||||||
|
TITLE = "AnyText Translator"
|
||||||
|
|
||||||
|
def AnyText_translator(self, prompt, model, Batch_prompt, if_Batch, device, Batch_Newline, t5_Target_Language):
|
||||||
|
device = get_device_by_name(device)
|
||||||
|
# 使用换行(\n)作为分隔符
|
||||||
|
Batch_prompt = Batch_prompt.split("\n")
|
||||||
|
input_sequence = prompt
|
||||||
|
if model == 'damo/nlp_csanmt_translation_zh2en':
|
||||||
|
sttime = time.time()
|
||||||
|
if if_Batch == True:
|
||||||
|
input_sequence = Batch_prompt
|
||||||
|
# 用特定的连接符<SENT_SPLIT>,将多个句子进行串联
|
||||||
|
input_sequence = '<SENT_SPLIT>'.join(input_sequence)
|
||||||
|
if os.access(os.path.join(comfyui_models_dir, "prompt_generator", "nlp_csanmt_translation_zh2en", "tf_ckpts", "ckpt-0.data-00000-of-00001"), os.F_OK):
|
||||||
|
zh2en_path = os.path.join(comfyui_models_dir, 'prompt_generator', 'nlp_csanmt_translation_zh2en')
|
||||||
|
else:
|
||||||
|
zh2en_path = "damo/nlp_csanmt_translation_zh2en"
|
||||||
|
|
||||||
|
if not is_module_imported('pipeline'):
|
||||||
|
from modelscope.pipelines import pipeline
|
||||||
|
if not is_module_imported('Tasks'):
|
||||||
|
from modelscope.utils.constant import Tasks
|
||||||
|
if device == 'cuda':
|
||||||
|
pipeline_ins = pipeline(task=Tasks.translation, model=zh2en_path, device='gpu')
|
||||||
|
outputs = pipeline_ins(input=input_sequence)
|
||||||
|
if if_Batch == True:
|
||||||
|
results = outputs['translation'].split('<SENT_SPLIT>')
|
||||||
|
if Batch_Newline == True:
|
||||||
|
results = '\n\n'.join(results)
|
||||||
|
else:
|
||||||
|
results = ' '.join(results)
|
||||||
|
else:
|
||||||
|
results = outputs['translation']
|
||||||
|
endtime = time.time()
|
||||||
|
print("\033[93mTime for translating(翻译耗时): ", endtime - sttime, "\033[0m")
|
||||||
|
del pipeline_ins
|
||||||
|
if torch.cuda.is_available():
|
||||||
|
torch.cuda.empty_cache()
|
||||||
|
else:
|
||||||
|
if if_Batch == True:
|
||||||
|
input_sequence = Batch_prompt
|
||||||
|
# 用特定的连接符<SENT_SPLIT>,将多个句子进行串联
|
||||||
|
input_sequence = '|'.join(input_sequence)
|
||||||
|
self.zh2en_path = os.path.join(folder_paths.models_dir, "prompt_generator", "models--utrobinmv--t5_translate_en_ru_zh_small_1024")
|
||||||
|
if not os.access(os.path.join(self.zh2en_path, "model.safetensors"), os.F_OK):
|
||||||
|
self.zh2en_path = "utrobinmv/t5_translate_en_ru_zh_small_1024"
|
||||||
|
outputs = t5_translate_en_ru_zh(t5_Target_Language, input_sequence, self.zh2en_path, device)[0]
|
||||||
|
if if_Batch == True:
|
||||||
|
results = outputs.split('| ')
|
||||||
|
if Batch_Newline == True:
|
||||||
|
results = '\n\n'.join(results)
|
||||||
|
else:
|
||||||
|
results = ' '.join(results)
|
||||||
|
else:
|
||||||
|
results = outputs
|
||||||
|
|
||||||
|
with open(temp_txt_path, "w", encoding="UTF-8") as text_file:
|
||||||
|
text_file.write(results)
|
||||||
|
return (results, )
|
||||||
|
|
||||||
|
def is_module_imported(module_name):
|
||||||
|
try:
|
||||||
|
__import__(module_name)
|
||||||
|
except ImportError:
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
return True
|
||||||
|
|
||||||
|
def pil2tensor(image):
|
||||||
|
return torch.from_numpy(np.array(image).astype(np.float32) / 255.0).unsqueeze(0)
|
||||||
|
|
||||||
|
def is_folder_exist(folder_path):
|
||||||
|
result = os.path.exists(folder_path)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def get_device_by_name(device):
|
||||||
|
if device == 'auto':
|
||||||
|
try:
|
||||||
|
device = "cpu"
|
||||||
|
if torch.cuda.is_available():
|
||||||
|
device = "cuda"
|
||||||
|
elif torch.backends.mps.is_available():
|
||||||
|
device = "mps"
|
||||||
|
elif torch.xpu.is_available():
|
||||||
|
device = "xpu"
|
||||||
|
except:
|
||||||
|
raise AttributeError("What's your device(到底用什么设备跑的)?")
|
||||||
|
print("\033[93mUse Device(使用设备):", device, "\033[0m")
|
||||||
|
return device
|
||||||
|
|
||||||
|
# Node class and display name mappings
|
||||||
|
NODE_CLASS_MAPPINGS = {
|
||||||
|
"AnyText_loader": AnyText_loader,
|
||||||
|
"AnyText_translator": AnyText_translator,
|
||||||
|
}
|
||||||
|
|
||||||
|
def t5_translate_en_ru_zh(Target_Language, prompt, model_path, device):
|
||||||
|
|
||||||
|
# prefix = 'translate to en: '
|
||||||
|
sttime = time.time()
|
||||||
|
if not is_module_imported('T5ForConditionalGeneration'):
|
||||||
|
from transformers import T5ForConditionalGeneration
|
||||||
|
if not is_module_imported('T5Tokenizer'):
|
||||||
|
from transformers import T5Tokenizer
|
||||||
|
model = T5ForConditionalGeneration.from_pretrained(model_path,)
|
||||||
|
tokenizer = T5Tokenizer.from_pretrained(model_path)
|
||||||
|
if Target_Language == 'zh':
|
||||||
|
prefix = 'translate to zh: '
|
||||||
|
elif Target_Language == 'en':
|
||||||
|
prefix = 'translate to en: '
|
||||||
|
else:
|
||||||
|
prefix = 'translate to ru: '
|
||||||
|
src_text = prefix + prompt
|
||||||
|
input_ids = tokenizer(src_text, return_tensors="pt")
|
||||||
|
generated_tokens = model.generate(**input_ids).to(device, torch.float32)
|
||||||
|
result = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
|
||||||
|
model.to('cpu')
|
||||||
|
endtime = time.time()
|
||||||
|
print("\033[93mTime for translating(翻译耗时): ", endtime - sttime, "\033[0m")
|
||||||
|
return result
|
||||||
|
|
||||||
|
def comfy_tensor_Image2np_Image(self,comfy_tensor_image):
|
||||||
|
comfyimage = comfy_tensor_image.numpy()[0] * 255
|
||||||
|
image_np = comfyimage.astype(np.uint8)
|
||||||
|
image = Image.fromarray(image_np)
|
||||||
|
return image
|
101
README.md
Normal file
@ -0,0 +1,101 @@
|
|||||||
|
# Unofficial Simple And Rough Implementation Of AnyText | [中文说明](./AnyText/assets/README-Zh-CN.md)
|
||||||
|
|
||||||
|
## Original Repo: [AnyText: Multilingual Visual Text Generation And Editing](https://github.com/tyxsspa/AnyText)
|
||||||
|
|
||||||
|
## For personal reason Suspended maintenance.
|
||||||
|
|
||||||
|
## Warning:
|
||||||
|
- I'm not a coder, so many issues i have no idea how to solve.
|
||||||
|
- **Do not install modelscope & tensorflow packages if `damo/nlp_csanmt_translation_zh2en` translator not needed!!!**
|
||||||
|
- This custom-node results maybe worse than official.
|
||||||
|
- Tested only on **cuda with fp16/fp32** , you can try others options but maybe not work.
|
||||||
|
- Tested with **Official_ComfyUI_Stable_Release** using **python_embed** on **windows** in my case. Distributions from unofficial or vitural env or other OS(such as linux) maybe not work.
|
||||||
|
- Tensorflow need specified cuda_version to run on gpu, but on native windows [tensorflow 2.10+: look at the note](https://github.com/tensorflow/tensorflow/releases/tag/v2.11.1) will not work on cuda, we need linux or wsl2 to make gpu work. In this case, `damo/nlp_csanmt_translation_zh2en` translator will run slowly on cpu.
|
||||||
|
- If error `Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same` raise, try set **all_to_device** to true, maybe works. Thanks to **@[602387193c](https://github.com/602387193c)**----->**[issues/17](https://github.com/zmwv823/ComfyUI-AnyText/issues/17)**.
|
||||||
|
- If error `expected scalar type Half but found Float`, try fp32.
|
||||||
|
### v2 test, more native, not remote_code mode.
|
||||||
|
|
||||||
|
## Instructions:
|
||||||
|
- `utrobinmv/t5_translate_en_ru_zh_small_1024` (212MB) is faster and smaller, but accurancy is far worse than `damo/nlp_csanmt_translation_zh2en`(7.3GB).
|
||||||
|
- Input_prompts will be checked if is_Chinese_prompts to decide whether auto load translator or not.
|
||||||
|
- Numbers of draw_masks must >= nunbers of string_content (in the "") we want to generate, or it will raise an error ["not enough values to unpack"](https://github.com/zmwv823/ComfyUI-AnyText/issues/7).
|
||||||
|
- works on my pc: ComfyUI official release+(ComfyUI_windows_portable\ComfyUI)start with powershell+python_embed+win10+py311+torch2.3.0+cu121+rtx3050laptop(4GB).
|
||||||
|
- pillow>=9.5.0(10.3.0) Most packages are the newest.
|
||||||
|
- **Accept any resolution image input, but will resized to <=768, output images will limited to <=768.(Official method)**
|
||||||
|
- **If font、ckpt_name、clip、translator set to Auto_DownLoad, default models will automtically download to specified directory. Models will loaded if models already exist.**
|
||||||
|
- AnyText model will download into "ComfyUI\models\checkpoints\15\anytext_v1.1.safetensors" from huggingface(fp16: 2.66 GB).
|
||||||
|
- We can manually download [AnyText-FP32-5.73 GB](https://modelscope.cn/models/iic/cv_anytext_text_generation_editing/file/view/master?fileName=anytext_v1.1.ckpt&status=2) from modelscope,(fp32 5.73 GB).Then put it into **ComfyUI\models\checkpoints**.
|
||||||
|
- Or manually download [AnyText-FP16-2.66 GB](https://huggingface.co/Sanster/AnyText/blob/main/pytorch_model.fp16.safetensors) from huggingface and rename it to **anytext_v1.1.safetensors or whatever you like**.Then put it into **ComfyUI\models\checkpoints**.
|
||||||
|
- clip model [**clip-vit-large-patch14**](https://huggingface.co/openai/clip-vit-large-patch14) will download into `C:\Users\username\.cache\huggingface\hub`. We can manually download all files from [clip_model](https://huggingface.co/openai/clip-vit-large-patch14) into **ComfyUI\models\clip\openai--clip-vit-large-patch14**.
|
||||||
|
- ![](./AnyText/assets/clip_model.jpg)
|
||||||
|
- [Font-(SourceHanSansSC-Medium.otf)-18MB](https://huggingface.co/Sanster/AnyText/blob/main/SourceHanSansSC-Medium.otf) will download into **ComfyUI\models\fonts** from huggingface, we can use any other fonts too.
|
||||||
|
- Translator model [huggingface--utrobinmv/t5_translate_en_ru_zh_small_1024-212MB](https://huggingface.co/utrobinmv/t5_translate_en_ru_zh_small_1024) will download into `C:\Users\username\.cache\huggingface\hub` or [modelscope--damo\nlp_csanmt_translation_zh2en--7.3GB](https://www.modelscope.cn/models/iic/nlp_csanmt_translation_zh2en) will download into `C:\Users\username\.cache\modelscope\hub\damo`. We can maually download translator model from link before, then put all files into `ComfyUI\models\prompt_generator\models--utrobinmv--t5_translate_en_ru_zh_small_1024` or `ComfyUI\models\prompt_generator\nlp_csanmt_translation_zh2en`.
|
||||||
|
- ![](./AnyText/assets/zh2en_model.jpg)
|
||||||
|
- **The AnyText model itself is also a standard sd1.5 text2image model.**
|
||||||
|
## Example Prompts:
|
||||||
|
### Text-Generation English Prompts:
|
||||||
|
- An exquisite mug with an ancient Chinese poem engraved on it, including “花落知多少” and “夜来风雨声” and “处处闻啼鸟” and “春眠不觉晓”
|
||||||
|
- Sign on the clean building that reads “科学” and "과학" and "ステップ" and "SCIENCE"
|
||||||
|
- An ice sculpture is made with the text "Happy" and "Holidays".Dslr photo.
|
||||||
|
- A baseball cap with words “要聪明地” and “全力以赴”
|
||||||
|
- A nice drawing of octopus, sharks, and boats made by a child with crayons, with the words “神奇海底世界”
|
||||||
|
### Text-Editing English Prompts:
|
||||||
|
- A Minion meme that says "wrong"
|
||||||
|
- A pile of fruit with "UIT" written in the middle
|
||||||
|
- photo of clean sandy beach," " " "
|
||||||
|
### Text-Generation Chinese Prompts:
|
||||||
|
- 一个儿童蜡笔画,森林里有一个可爱的蘑菇形状的房子,标题是"森林小屋"
|
||||||
|
- 一个精美设计的logo,画的是一个黑白风格的厨师,带着厨师帽,logo下方写着“深夜食堂”
|
||||||
|
- 一张户外雪地靴的电商广告,上面写着 “双12大促!”,“立减50”,“加绒加厚”,“穿脱方便”,“温暖24小时送达”, “包邮”,高级设计感,精美构图
|
||||||
|
- 一个精致的马克杯,上面雕刻着一首中国古诗,内容是 "花落知多少" "夜来风雨声" "处处闻啼鸟" "春眠不觉晓"
|
||||||
|
- 一个漂亮的蜡笔画,有行星,宇航员,还有宇宙飞船,上面写的是"去火星旅行", "王小明", "11月1日"
|
||||||
|
- 一个装饰华丽的蛋糕,上面用奶油写着“阿里云”和"APSARA"
|
||||||
|
- 一张关于墙上的彩色涂鸦艺术的摄影作品,上面写着“人工智能" 和 "神经网络"
|
||||||
|
- 一枚中国古代铜钱, 上面的文字是 "康" "寶" "通" "熙"
|
||||||
|
- 精美的书法作品,上面写着“志” “存” “高” “远”
|
||||||
|
### Text-Editing Chinese Prompts:
|
||||||
|
- 一个表情包,小猪说 "下班"
|
||||||
|
- 一个中国古代铜钱,上面写着"乾" "隆"
|
||||||
|
- 一个黄色标志牌,上边写着"不要" 和 "大意"
|
||||||
|
- 一个建筑物前面的字母标牌, 上面写着 " "
|
||||||
|
## Example workflow:
|
||||||
|
![workflow](./AnyText/assets/AnyText-wf.png)
|
||||||
|
|
||||||
|
## Some Params:
|
||||||
|
|
||||||
|
### sort_radio: order to draw text.
|
||||||
|
|
||||||
|
- ↕ for y axis. It will draw text-content("string") from start-to-end(order) on the mask position from top to bottom.
|
||||||
|
- ↔ for x axis .It will draw text-content("string") from start-to-end(order) on the mask position from left to right.
|
||||||
|
|
||||||
|
### revise_pose: correct text position(only works in gen-mode).
|
||||||
|
|
||||||
|
- Which uses the bounding box of the rendered text as the revised position. However, it is occasionally found that the creativity of the generated text is slightly lower using this method, It dosen't work in text-edit mode.
|
||||||
|
|
||||||
|
### Random_Gen: automatic generate mask.
|
||||||
|
|
||||||
|
- Automatically generate mask based on the number of text-content("string"). With this checked the manual_draw mask dosen't work.
|
||||||
|
|
||||||
|
### nonEdit_random_gen_width & nonEdit_random_gen_height:
|
||||||
|
|
||||||
|
- For image size control with **text-generation and Random_Gen** together, works only in this situation.
|
||||||
|
|
||||||
|
### cpu_offload:
|
||||||
|
|
||||||
|
- For multi-turn generation, it will speed up a lot. But we need to turn it off and run once when this node is no more needed and with other process for deleting model from cpu(ram). If single generation, just turn it off.
|
||||||
|
|
||||||
|
## Citation:
|
||||||
|
### [Fork Repo: MaletteAI/anytext](https://github.com/MaletteAI/anytext)
|
||||||
|
- V2 build native pipeline method inspired by it.
|
||||||
|
### [Official Repo: tyxsspa/AnyText](https://github.com/tyxsspa/AnyText)
|
||||||
|
|
||||||
|
```
|
||||||
|
@article{tuo2023anytext,
|
||||||
|
title={AnyText: Multilingual Visual Text Generation And Editing},
|
||||||
|
author={Yuxiang Tuo and Wangmeng Xiang and Jun-Yan He and Yifeng Geng and Xuansong Xie},
|
||||||
|
year={2023},
|
||||||
|
eprint={2311.03054},
|
||||||
|
archivePrefix={arXiv},
|
||||||
|
primaryClass={cs.CV}
|
||||||
|
}
|
||||||
|
```
|
33
__init__.py
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
from .AnyText.utils import is_folder_exist
|
||||||
|
import folder_paths
|
||||||
|
import os
|
||||||
|
|
||||||
|
#加载插件前先检查是否在os.listdir里存在自定义目录,没有则自动创建,防止加载节点失败,官方目录可无视。
|
||||||
|
fonts_path = os.path.join(folder_paths.models_dir, 'fonts')
|
||||||
|
translator_path = os.path.join(folder_paths.models_dir, 'prompt_generator')
|
||||||
|
comfy_temp_dir = folder_paths.get_temp_directory()
|
||||||
|
if not is_folder_exist(fonts_path):
|
||||||
|
os.makedirs(fonts_path)
|
||||||
|
if not is_folder_exist(translator_path):
|
||||||
|
os.makedirs(translator_path)
|
||||||
|
if not is_folder_exist(comfy_temp_dir):
|
||||||
|
os.makedirs(comfy_temp_dir)
|
||||||
|
|
||||||
|
# only import if running as a custom node
|
||||||
|
try:
|
||||||
|
pass
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
NODE_CLASS_MAPPINGS = {}
|
||||||
|
|
||||||
|
# AnyText
|
||||||
|
from .AnyText.nodes import NODE_CLASS_MAPPINGS as AnyText_Nodes
|
||||||
|
NODE_CLASS_MAPPINGS.update(AnyText_Nodes)
|
||||||
|
|
||||||
|
# AnyText_utils
|
||||||
|
from .AnyText.utils import NODE_CLASS_MAPPINGS as AnyText_loader_Nodes
|
||||||
|
NODE_CLASS_MAPPINGS.update(AnyText_loader_Nodes)
|
||||||
|
|
||||||
|
NODE_DISPLAY_NAME_MAPPINGS = {k:v.TITLE for k,v in NODE_CLASS_MAPPINGS.items()}
|
||||||
|
__all__ = ['NODE_CLASS_MAPPINGS', 'NODE_DISPLAY_NAME_MAPPINGS']
|
11
requirements-without-nlp-translator.txt
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
pillow
|
||||||
|
numpy<=1.26.4
|
||||||
|
torchvision
|
||||||
|
opencv-python
|
||||||
|
transformers
|
||||||
|
accelerate
|
||||||
|
einops
|
||||||
|
huggingface_hub
|
||||||
|
pytorch_lightning
|
||||||
|
torch
|
||||||
|
ujson
|
15
requirements.txt
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
pillow
|
||||||
|
numpy<=1.26.4
|
||||||
|
torchvision
|
||||||
|
opencv-python
|
||||||
|
transformers
|
||||||
|
accelerate
|
||||||
|
einops
|
||||||
|
huggingface_hub
|
||||||
|
pytorch_lightning
|
||||||
|
torch
|
||||||
|
ujson
|
||||||
|
|
||||||
|
#nlp translator
|
||||||
|
modelscope
|
||||||
|
tensorflow
|
214
tree.py
Normal file
@ -0,0 +1,214 @@
|
|||||||
|
F:.
|
||||||
|
│ .gitattributes
|
||||||
|
│ .gitignore
|
||||||
|
│ README.md
|
||||||
|
│ requirements-without-translator.txt
|
||||||
|
│ requirements.txt
|
||||||
|
│ __init__.py
|
||||||
|
│
|
||||||
|
├─AnyText
|
||||||
|
│ │ nodes.py
|
||||||
|
│ │ utils.py
|
||||||
|
│ │
|
||||||
|
│ ├─AnyText_scripts
|
||||||
|
│ │ │ AnyText_bert_tokenizer.py
|
||||||
|
│ │ │ AnyText_dataset_util.py
|
||||||
|
│ │ │ AnyText_pipeline.py
|
||||||
|
│ │ │ AnyText_pipeline_util.py
|
||||||
|
│ │ │ AnyText_t3_dataset.py
|
||||||
|
│ │ │
|
||||||
|
│ │ ├─cldm
|
||||||
|
│ │ │ │ cldm.py
|
||||||
|
│ │ │ │ ddim_hacked.py
|
||||||
|
│ │ │ │ embedding_manager.py
|
||||||
|
│ │ │ │ hack.py
|
||||||
|
│ │ │ │ logger.py
|
||||||
|
│ │ │ │ model.py
|
||||||
|
│ │ │ │ recognizer.py
|
||||||
|
│ │ │ │
|
||||||
|
│ │ │ ├─ocr_recog
|
||||||
|
│ │ │ │ │ common.py
|
||||||
|
│ │ │ │ │ en_dict.txt
|
||||||
|
│ │ │ │ │ ppocr_keys_v1.txt
|
||||||
|
│ │ │ │ │ RecCTCHead.py
|
||||||
|
│ │ │ │ │ RecModel.py
|
||||||
|
│ │ │ │ │ RecMv1_enhance.py
|
||||||
|
│ │ │ │ │ RecSVTR.py
|
||||||
|
│ │ │ │ │ RNN.py
|
||||||
|
│ │ │ │ │
|
||||||
|
│ │ │ │ └─__pycache__
|
||||||
|
│ │ │ │ common.cpython-311.pyc
|
||||||
|
│ │ │ │ RecCTCHead.cpython-311.pyc
|
||||||
|
│ │ │ │ RecModel.cpython-311.pyc
|
||||||
|
│ │ │ │ RecMv1_enhance.cpython-311.pyc
|
||||||
|
│ │ │ │ RecSVTR.cpython-311.pyc
|
||||||
|
│ │ │ │ RNN.cpython-311.pyc
|
||||||
|
│ │ │ │
|
||||||
|
│ │ │ └─__pycache__
|
||||||
|
│ │ │ cldm.cpython-311.pyc
|
||||||
|
│ │ │ ddim_hacked.cpython-311.pyc
|
||||||
|
│ │ │ embedding_manager.cpython-311.pyc
|
||||||
|
│ │ │ model.cpython-311.pyc
|
||||||
|
│ │ │ recognizer.cpython-311.pyc
|
||||||
|
│ │ │
|
||||||
|
│ │ ├─ldm
|
||||||
|
│ │ │ │ util.py
|
||||||
|
│ │ │ │
|
||||||
|
│ │ │ ├─data
|
||||||
|
│ │ │ │ util.py
|
||||||
|
│ │ │ │ __init__.py
|
||||||
|
│ │ │ │
|
||||||
|
│ │ │ ├─models
|
||||||
|
│ │ │ │ │ autoencoder.py
|
||||||
|
│ │ │ │ │
|
||||||
|
│ │ │ │ ├─diffusion
|
||||||
|
│ │ │ │ │ │ ddim.py
|
||||||
|
│ │ │ │ │ │ ddpm.py
|
||||||
|
│ │ │ │ │ │ plms.py
|
||||||
|
│ │ │ │ │ │ recognizer.py
|
||||||
|
│ │ │ │ │ │ sampling_util.py
|
||||||
|
│ │ │ │ │ │ __init__.py
|
||||||
|
│ │ │ │ │ │
|
||||||
|
│ │ │ │ │ ├─dpm_solver
|
||||||
|
│ │ │ │ │ │ dpm_solver.py
|
||||||
|
│ │ │ │ │ │ sampler.py
|
||||||
|
│ │ │ │ │ │ __init__.py
|
||||||
|
│ │ │ │ │ │
|
||||||
|
│ │ │ │ │ ├─ocr_recog
|
||||||
|
│ │ │ │ │ │ │ common.py
|
||||||
|
│ │ │ │ │ │ │ en_dict.txt
|
||||||
|
│ │ │ │ │ │ │ ppocr_keys_v1.txt
|
||||||
|
│ │ │ │ │ │ │ RecCTCHead.py
|
||||||
|
│ │ │ │ │ │ │ RecModel.py
|
||||||
|
│ │ │ │ │ │ │ RecMv1_enhance.py
|
||||||
|
│ │ │ │ │ │ │ RecSVTR.py
|
||||||
|
│ │ │ │ │ │ │ RNN.py
|
||||||
|
│ │ │ │ │ │ │
|
||||||
|
│ │ │ │ │ │ └─__pycache__
|
||||||
|
│ │ │ │ │ │ common.cpython-311.pyc
|
||||||
|
│ │ │ │ │ │ RecCTCHead.cpython-311.pyc
|
||||||
|
│ │ │ │ │ │ RecModel.cpython-311.pyc
|
||||||
|
│ │ │ │ │ │ RecMv1_enhance.cpython-311.pyc
|
||||||
|
│ │ │ │ │ │ RecSVTR.cpython-311.pyc
|
||||||
|
│ │ │ │ │ │ RNN.cpython-311.pyc
|
||||||
|
│ │ │ │ │ │
|
||||||
|
│ │ │ │ │ └─__pycache__
|
||||||
|
│ │ │ │ │ ddim.cpython-311.pyc
|
||||||
|
│ │ │ │ │ ddpm.cpython-311.pyc
|
||||||
|
│ │ │ │ │ recognizer.cpython-311.pyc
|
||||||
|
│ │ │ │ │ __init__.cpython-311.pyc
|
||||||
|
│ │ │ │ │
|
||||||
|
│ │ │ │ └─__pycache__
|
||||||
|
│ │ │ │ autoencoder.cpython-311.pyc
|
||||||
|
│ │ │ │
|
||||||
|
│ │ │ ├─modules
|
||||||
|
│ │ │ │ │ attention.py
|
||||||
|
│ │ │ │ │ ema.py
|
||||||
|
│ │ │ │ │
|
||||||
|
│ │ │ │ ├─diffusionmodules
|
||||||
|
│ │ │ │ │ │ model.py
|
||||||
|
│ │ │ │ │ │ openaimodel.py
|
||||||
|
│ │ │ │ │ │ upscaling.py
|
||||||
|
│ │ │ │ │ │ util.py
|
||||||
|
│ │ │ │ │ │ __init__.py
|
||||||
|
│ │ │ │ │ │
|
||||||
|
│ │ │ │ │ └─__pycache__
|
||||||
|
│ │ │ │ │ model.cpython-311.pyc
|
||||||
|
│ │ │ │ │ openaimodel.cpython-311.pyc
|
||||||
|
│ │ │ │ │ util.cpython-311.pyc
|
||||||
|
│ │ │ │ │ __init__.cpython-311.pyc
|
||||||
|
│ │ │ │ │
|
||||||
|
│ │ │ │ ├─distributions
|
||||||
|
│ │ │ │ │ │ distributions.py
|
||||||
|
│ │ │ │ │ │ __init__.py
|
||||||
|
│ │ │ │ │ │
|
||||||
|
│ │ │ │ │ └─__pycache__
|
||||||
|
│ │ │ │ │ distributions.cpython-311.pyc
|
||||||
|
│ │ │ │ │ __init__.cpython-311.pyc
|
||||||
|
│ │ │ │ │
|
||||||
|
│ │ │ │ ├─encoders
|
||||||
|
│ │ │ │ │ │ modules.py
|
||||||
|
│ │ │ │ │ │ __init__.py
|
||||||
|
│ │ │ │ │ │
|
||||||
|
│ │ │ │ │ └─__pycache__
|
||||||
|
│ │ │ │ │ modules.cpython-311.pyc
|
||||||
|
│ │ │ │ │ __init__.cpython-311.pyc
|
||||||
|
│ │ │ │ │
|
||||||
|
│ │ │ │ ├─image_degradation
|
||||||
|
│ │ │ │ │ │ bsrgan.py
|
||||||
|
│ │ │ │ │ │ bsrgan_light.py
|
||||||
|
│ │ │ │ │ │ utils_image.py
|
||||||
|
│ │ │ │ │ │ __init__.py
|
||||||
|
│ │ │ │ │ │
|
||||||
|
│ │ │ │ │ └─utils
|
||||||
|
│ │ │ │ │ test.png
|
||||||
|
│ │ │ │ │
|
||||||
|
│ │ │ │ ├─midas
|
||||||
|
│ │ │ │ │ │ api.py
|
||||||
|
│ │ │ │ │ │ utils.py
|
||||||
|
│ │ │ │ │ │ __init__.py
|
||||||
|
│ │ │ │ │ │
|
||||||
|
│ │ │ │ │ └─midas
|
||||||
|
│ │ │ │ │ base_model.py
|
||||||
|
│ │ │ │ │ blocks.py
|
||||||
|
│ │ │ │ │ dpt_depth.py
|
||||||
|
│ │ │ │ │ midas_net.py
|
||||||
|
│ │ │ │ │ midas_net_custom.py
|
||||||
|
│ │ │ │ │ transforms.py
|
||||||
|
│ │ │ │ │ vit.py
|
||||||
|
│ │ │ │ │ __init__.py
|
||||||
|
│ │ │ │ │
|
||||||
|
│ │ │ │ └─__pycache__
|
||||||
|
│ │ │ │ attention.cpython-311.pyc
|
||||||
|
│ │ │ │ ema.cpython-311.pyc
|
||||||
|
│ │ │ │
|
||||||
|
│ │ │ └─__pycache__
|
||||||
|
│ │ │ util.cpython-311.pyc
|
||||||
|
│ │ │
|
||||||
|
│ │ └─__pycache__
|
||||||
|
│ │ AnyText_bert_tokenizer.cpython-311.pyc
|
||||||
|
│ │ AnyText_dataset_util.cpython-311.pyc
|
||||||
|
│ │ AnyText_pipeline.cpython-311.pyc
|
||||||
|
│ │ AnyText_pipeline_util.cpython-311.pyc
|
||||||
|
│ │ AnyText_t3_dataset.cpython-311.pyc
|
||||||
|
│ │
|
||||||
|
│ ├─assets
|
||||||
|
│ │ AnyText-wf.png
|
||||||
|
│ │ clip_model.jpg
|
||||||
|
│ │ README-Zh-CN.md
|
||||||
|
│ │ zh2en_model.jpg
|
||||||
|
│ │
|
||||||
|
│ ├─example_images
|
||||||
|
│ │ edit12.png
|
||||||
|
│ │ edit13.png
|
||||||
|
│ │ edit15.png
|
||||||
|
│ │ edit16.png
|
||||||
|
│ │ edit2.png
|
||||||
|
│ │ edit3.png
|
||||||
|
│ │ edit5.png
|
||||||
|
│ │ ref12.png
|
||||||
|
│ │ ref13.jpg
|
||||||
|
│ │ ref15.jpeg
|
||||||
|
│ │ ref16.jpeg
|
||||||
|
│ │ ref2.jpg
|
||||||
|
│ │ ref3.jpg
|
||||||
|
│ │ ref5.jpg
|
||||||
|
│ │
|
||||||
|
│ ├─models_yaml
|
||||||
|
│ │ anytext_sd15.yaml
|
||||||
|
│ │
|
||||||
|
│ ├─ocr_weights
|
||||||
|
│ │ ppocr_keys_v1.txt
|
||||||
|
│ │ ppv3_rec.pth
|
||||||
|
│ │
|
||||||
|
│ ├─temp_dir
|
||||||
|
│ │ AnyText_manual_mask_pos_img.png
|
||||||
|
│ │ AnyText_random_mask_pos_img.png
|
||||||
|
│ │ AnyText_temp.txt
|
||||||
|
│ │
|
||||||
|
│ └─__pycache__
|
||||||
|
│ nodes.cpython-311.pyc
|
||||||
|
│ utils.cpython-311.pyc
|
||||||
|
│
|
||||||
|
└─__pycache__
|
||||||
|
__init__.cpython-311.pyc
|