Qwen3-VL Full Fine-Tuning Guide
This note documents a practical workflow for Qwen3-VL full fine-tuning, including environment preparation, training execution, and result evaluation.
1. Background
Since the official repository does not provide a complete model-performance validation script, this guide supplements the evaluation script and provides a testing workflow. For the complete full fine-tuning project files, refer to my repository: Qwen3-VL-2B-FullFinetune.
2. Preparation
Before you begin, complete the environment setup according to the official documentation.
Prepare the following resources:
- Dataset: llava-instruct-mix-vsft. The downloaded format is parquet, so you need to convert it to json first.
- Base model weights: Qwen3-VL-2B-Instruct.
- Clone the project code. Run the following command in your working directory:
git clone https://github.com/2U1/Qwen-VL-Series-Finetune.git
Note: Parquet (Apache Parquet) is a columnar storage format for analytical workloads and is widely used in big-data and data-warehouse scenarios.
3. Run Full Fine-Tuning
First, update MODEL_NAME, data_path, and image_folder in finetune.sh, then run:
# Note: Replace the following paths with your local machine paths.
python -m venv <env-name>
source <venv-path>/bin/activate
cd <your-project-path>/Qwen-VL-Series-Finetune/
bash scripts/finetune.sh
4. Prepare the Evaluation Script
Create scripts/eval_sft_test.py (the path used by the evaluation command in section 5) and paste the following content:
#!/usr/bin/env python3
import argparse
import json
import random
import time
from pathlib import Path
import torch
from qwen_vl_utils import process_vision_info
from src.utils import disable_torch_init, get_model_name_from_path, load_pretrained_model
def normalize_text(text: str) -> str:
    """Lowercase *text* and collapse all whitespace runs into single spaces.

    A ``None`` or empty input normalizes to the empty string.
    """
    tokens = (text or "").lower().split()
    return " ".join(tokens)
def build_user_content(sample: dict, user_text: str, image_root: Path, turn_idx: int) -> list:
    """Assemble the multimodal content list for one user turn.

    The sample's image is attached when the turn text contains an ``<image>``
    placeholder, or implicitly on the very first turn of a sample that has an
    image. The image is only added if the resolved path actually exists. The
    ``<image>`` placeholder is stripped from the text portion. A turn that
    would otherwise be empty falls back to a single empty text part.
    """
    text = user_text or ""
    image_rel = sample.get("image")
    parts = []

    wants_image = ("<image>" in text) or (turn_idx == 0 and image_rel)
    if wants_image and image_rel:
        candidate = image_root / image_rel
        if candidate.exists():
            parts.append({"type": "image", "image": str(candidate)})

    stripped = text.replace("<image>", "").strip()
    if stripped:
        parts.append({"type": "text", "text": stripped})

    return parts or [{"type": "text", "text": ""}]
def generate_answer(messages: list, processor, model, device: str, max_new_tokens: int, temperature: float, top_p: float, repetition_penalty: float) -> str:
    """Generate one assistant reply for the given chat *messages*.

    Builds the chat prompt, packs text/image/video inputs, and decodes only
    the newly generated continuation (the prompt tokens are sliced off).
    Greedy decoding is used when ``temperature`` is 0; otherwise sampling
    with ``temperature`` and ``top_p`` is enabled.
    """
    chat_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    images, videos = process_vision_info(messages)
    model_inputs = processor(
        text=[chat_text],
        images=images,
        videos=videos,
        padding=True,
        return_tensors="pt",
    ).to(device)

    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": repetition_penalty,
        "eos_token_id": processor.tokenizer.eos_token_id,
    }
    if temperature > 0:
        gen_kwargs["do_sample"] = True
        gen_kwargs["temperature"] = temperature
        gen_kwargs["top_p"] = top_p
    else:
        gen_kwargs["do_sample"] = False

    with torch.inference_mode():
        output_ids = model.generate(**model_inputs, **gen_kwargs)

    # Keep only the tokens generated after the prompt.
    new_token_ids = output_ids[:, model_inputs.input_ids.shape[1]:]
    decoded = processor.batch_decode(
        new_token_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    return decoded.strip()
def main():
    """Evaluate a (fine-tuned) Qwen-VL checkpoint on an SFT-style test set.

    For every assistant ("gpt") turn in each sample's conversation, the model
    generates an answer conditioned on all preceding turns (gold assistant
    replies are kept in context, i.e. teacher forcing across turns). Each
    prediction is scored against the reference with normalized exact-match
    and containment metrics; per-turn predictions can be streamed to a jsonl
    file that also serves as the resume checkpoint.

    Fix over the previous revision: the prediction jsonl handle is now closed
    in a ``finally`` block, so a mid-run failure (e.g. CUDA OOM) no longer
    leaks the handle or loses buffered lines, keeping ``--resume`` usable.
    """
    parser = argparse.ArgumentParser(description="Batch evaluation for Qwen-VL SFT test set.")
    parser.add_argument("--model-path", type=str, required=True, help="Path to finetuned model (e.g. output/test_fft).")
    parser.add_argument("--test-json", type=str, required=True, help="Path to test data.json.")
    parser.add_argument("--image-root", type=str, required=True, help="Root folder for image relative paths in json.")
    parser.add_argument("--model-base", type=str, default=None, help="Only needed for LoRA adapter checkpoints.")
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--max-samples", type=int, default=100, help="How many samples to evaluate. Use -1 for all.")
    parser.add_argument("--sample-mode", type=str, default="first", choices=["first", "random"])
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--max-new-tokens", type=int, default=128)
    parser.add_argument("--temperature", type=float, default=0.0)
    parser.add_argument("--top-p", type=float, default=0.9)
    parser.add_argument("--repetition-penalty", type=float, default=1.0)
    parser.add_argument("--save-preds", action="store_true", help="Save per-turn predictions to jsonl.")
    parser.add_argument("--pred-output", type=str, default="output/eval_test_preds.jsonl")
    parser.add_argument("--summary-output", type=str, default="output/eval_test_summary.json")
    parser.add_argument("--log-every", type=int, default=100, help="Print progress every N evaluated turns.")
    parser.add_argument("--resume", action="store_true", help="Resume from existing pred-output jsonl.")
    args = parser.parse_args()

    disable_torch_init()
    model_name = get_model_name_from_path(args.model_path)
    processor, model = load_pretrained_model(
        model_path=args.model_path,
        model_base=args.model_base,
        model_name=model_name,
        device_map=args.device,
        device=args.device,
        use_flash_attn=True,
    )
    model.eval()

    test_json = Path(args.test_json)
    image_root = Path(args.image_root)
    with test_json.open("r", encoding="utf-8") as f:
        samples = json.load(f)

    # Optional sub-sampling for quick smoke tests; -1 (or any non-positive
    # value) evaluates the full set.
    if 0 < args.max_samples < len(samples):
        if args.sample_mode == "random":
            random.seed(args.seed)
            indices = random.sample(range(len(samples)), args.max_samples)
            eval_samples = [samples[i] for i in indices]
        else:
            eval_samples = samples[:args.max_samples]
    else:
        eval_samples = samples

    pred_f = None
    done_keys = set()
    resumed_turns = 0
    resumed_exact = 0
    resumed_contains = 0
    if args.resume and args.save_preds:
        # Reload previously written predictions so finished turns are skipped
        # and their scores are folded back into the running totals.
        pred_path = Path(args.pred_output)
        if pred_path.exists():
            with pred_path.open("r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        row = json.loads(line)
                    except json.JSONDecodeError:
                        # Tolerate a truncated trailing line from an interrupted run.
                        continue
                    done_keys.add((row.get("sample_index"), row.get("turn_index")))
                    resumed_turns += 1
                    resumed_exact += int(row.get("exact_match", 0))
                    resumed_contains += int(row.get("contains_match", 0))

    if args.save_preds:
        pred_path = Path(args.pred_output)
        pred_path.parent.mkdir(parents=True, exist_ok=True)
        mode = "a" if args.resume else "w"
        pred_f = pred_path.open(mode, encoding="utf-8")

    total_turns = resumed_turns
    exact_match = resumed_exact
    contains_match = resumed_contains
    new_eval_turns = 0
    start = time.time()
    try:
        for sample_idx, sample in enumerate(eval_samples):
            conversations = sample.get("conversations", [])
            messages = []
            for turn_idx, turn in enumerate(conversations):
                role = turn.get("from")
                value = (turn.get("value") or "").strip()
                if role == "human":
                    user_content = build_user_content(sample, value, image_root, turn_idx)
                    messages.append({"role": "user", "content": user_content})
                    continue
                if role != "gpt":
                    continue
                if not messages:
                    # Assistant turn with no preceding user turn: nothing to condition on.
                    continue
                ref_text = value
                key = (sample_idx, turn_idx)
                if key in done_keys:
                    # Already evaluated in a previous run; keep the gold reply
                    # in context so later turns see the same history.
                    messages.append({"role": "assistant", "content": [{"type": "text", "text": ref_text}]})
                    continue
                pred_text = generate_answer(
                    messages=messages,
                    processor=processor,
                    model=model,
                    device=args.device,
                    max_new_tokens=args.max_new_tokens,
                    temperature=args.temperature,
                    top_p=args.top_p,
                    repetition_penalty=args.repetition_penalty,
                )
                total_turns += 1
                new_eval_turns += 1
                norm_pred = normalize_text(pred_text)
                norm_ref = normalize_text(ref_text)
                is_exact = int(norm_pred == norm_ref)
                # Containment only counts when the reference is non-empty,
                # otherwise every prediction would trivially "contain" it.
                is_contains = int(norm_ref in norm_pred) if norm_ref else 0
                exact_match += is_exact
                contains_match += is_contains
                if args.log_every > 0 and (new_eval_turns % args.log_every == 0):
                    elapsed = time.time() - start
                    speed = new_eval_turns / elapsed if elapsed > 0 else 0.0
                    print(
                        f"[progress] new_eval_turns={new_eval_turns} total_turns={total_turns} "
                        f"exact_match={exact_match / total_turns:.6f} contains_match={contains_match / total_turns:.6f} "
                        f"speed={speed:.3f} turns/s elapsed_sec={elapsed:.1f}",
                        flush=True,
                    )
                if pred_f is not None:
                    record = {
                        "sample_index": sample_idx,
                        "turn_index": turn_idx,
                        "image": sample.get("image"),
                        "prediction": pred_text,
                        "reference": ref_text,
                        "exact_match": is_exact,
                        "contains_match": is_contains,
                    }
                    pred_f.write(json.dumps(record, ensure_ascii=False) + "\n")
                # Keep gold assistant turn in context for next user turn.
                messages.append({"role": "assistant", "content": [{"type": "text", "text": ref_text}]})
    finally:
        # Close (and flush) the jsonl even if generation fails mid-run so
        # completed predictions stay usable with --resume.
        if pred_f is not None:
            pred_f.close()

    elapsed = time.time() - start
    summary = {
        "num_samples": len(eval_samples),
        "num_eval_turns": total_turns,
        "num_new_eval_turns": new_eval_turns,
        "exact_match": (exact_match / total_turns) if total_turns else 0.0,
        "contains_match": (contains_match / total_turns) if total_turns else 0.0,
        "elapsed_sec": elapsed,
    }
    summary_path = Path(args.summary_output)
    summary_path.parent.mkdir(parents=True, exist_ok=True)
    with summary_path.open("w", encoding="utf-8") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)
    print(json.dumps(summary, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
5. Run Evaluation
Run the following command to evaluate on the full test set:
# Note: Replace the following paths with your local machine paths.
source <your-conda-install-path>/bin/activate env_qwen3vl
cd <your-project-root>/Qwen-VL-Series-Finetune/
export PYTHONPATH=$PWD:$PYTHONPATH
python scripts/eval_sft_test.py \
--model-path output/test_fft \
--test-json <your-dataset-root>/data-converted/test/data.json \
--image-root <your-dataset-root>/data-converted/test \
--max-samples -1 \
--max-new-tokens 128 \
--save-preds \
--pred-output output/test_fft/<your-experiment-name>/eval_preds_all.jsonl \
--summary-output output/test_fft/<your-experiment-name>/eval_summary_all.json \
--log-every 100 \
--resume
6. Result Interpretation and Notes
- It is recommended to run a smoke test with a small sample set (for example, 200 samples) before running full evaluation.
- It is recommended to run a baseline as well, so you can compare it with the fine-tuned model.
- `exact_match` ranges from 0 to 1, where 1 means the prediction exactly matches the reference answer.
- Both training and evaluation outputs are stored in the `output` directory.