From 768354239b4437eb2fa739effdf8461cdd91cb6a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 7 Apr 2023 11:15:52 -0300 Subject: [PATCH] Change training file encoding --- modules/training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/training.py b/modules/training.py index 9880cf0..5107284 100644 --- a/modules/training.py +++ b/modules/training.py @@ -152,7 +152,7 @@ def do_train(lora_name: str, micro_batch_size: int, batch_size: int, epochs: int # == Prep the dataset, format, etc == if raw_text_file not in ['None', '']: print("Loading raw text file dataset...") - with open(clean_path('training/datasets', f'{raw_text_file}.txt'), 'r') as file: + with open(clean_path('training/datasets', f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file: raw_text = file.read() tokens = shared.tokenizer.encode(raw_text) del raw_text # Note: could be a gig for a large dataset, so delete redundant data as we go to be safe on RAM