Support LLaVA v1.5 (#4305)

2023-10-20 00:28:14 -05:00 · 2023-10-20 00:28:14 -05:00 · 32984ea2f0
commit 32984ea2f0
parent bb71272903
6 changed files with 111 additions and 18 deletions
--- a/extensions/multimodal/script.py
+++ b/extensions/multimodal/script.py
@@ -46,23 +46,24 @@ def chat_input_modifier(text, visible_text, state):

 def add_chat_picture(picture, text, visible_text):
    # resize the image, so that shortest edge is at least 224 (size for CLIP), and at most 300 (to keep history manageable)
+    # Both bounds below are now 336 (so the shortest edge is always resized to 336), due to the increased resolution in llava-v1.5
    max_hw, min_hw = max(picture.size), min(picture.size)
    aspect_ratio = max_hw / min_hw
-    shortest_edge = int(max(300 / aspect_ratio, 224))
+    shortest_edge = int(max(336 / aspect_ratio, 336))
    longest_edge = int(shortest_edge * aspect_ratio)
    w = shortest_edge if picture.width < picture.height else longest_edge
    h = shortest_edge if picture.width >= picture.height else longest_edge
    picture = picture.resize((w, h))

    buffer = BytesIO()
-    picture.save(buffer, format="JPEG")
+    picture.save(buffer, format="PNG")
    img_str = base64.b64encode(buffer.getvalue()).decode('utf-8')
    image = f'<img src="data:image/jpeg;base64,{img_str}">'

    if '<image>' in text:
        text = text.replace('<image>', image)
    else:
-        text = text + '\n' + image
+        text = image + '\n' + text

    if visible_text == '' or visible_text is None:
        visible_text = text