Add ability to load all text files from a subdirectory for training (#1997)
* Update utils.py: get_datasets now returns individual txt files and subdirectories, to allow training from a directory of text files
* Update training.py: minor tweak to training on raw datasets to detect whether a directory is selected and, if so, to load all the txt files in that directory for training
* Update put-trainer-datasets-here.txt document
* Minor change
* Use pathlib, sort by natural keys
* Space

---------

Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com>
This commit is contained in:
parent
73a0def4af
commit
5d513eea22
3 changed files with 22 additions and 5 deletions
|
@ -114,6 +114,10 @@ def get_available_loras():
|
|||
|
||||
|
||||
def get_datasets(path: str, ext: str):
    """Return the selectable dataset names found directly inside *path*.

    The result always starts with the literal entry ``'None'``, followed by
    the de-duplicated names sorted with ``natural_keys``.

    Args:
        path: Directory that holds the datasets.
        ext: File extension to look for, without the leading dot
            (for example ``"json"`` or ``"txt"``).

    Returns:
        A list of names. Files matching ``*.{ext}`` contribute their stem.
        When *ext* is ``"txt"``, immediate subdirectories are listed as well,
        so that a whole directory of text files can be selected for training.
        The placeholder ``put-trainer-datasets-here`` is never listed.
    """
    placeholder = 'put-trainer-datasets-here'
    root = Path(path)

    names = {entry.stem for entry in root.glob(f'*.{ext}')}

    # include subdirectories for raw txt files to allow training from a subdirectory of txt files
    if ext == "txt":
        # Filter with is_dir() instead of relying on a trailing-slash glob
        # pattern, whose behaviour differs between Python versions. Keep the
        # full directory name so a dot in it does not truncate the entry.
        names.update(entry.name for entry in root.glob('*') if entry.is_dir())

    names.discard(placeholder)
    return ['None'] + sorted(names, key=natural_keys)
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue