import argparse
import json
import os
import functools
import soundfile
from tqdm import tqdm
from utils.utils import download, unpack
from utils.utils import add_arguments, print_arguments
# Official AISHELL-1 release hosted on OpenSLR, and its expected MD5 checksum.
DATA_URL = 'https://openslr.elda.org/resources/33/data_aishell.tgz'
MD5_DATA = '2f494334227864a8a8fec932999db9d8'
# Command-line interface; add_arg is the project helper bound to this parser.
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# Path to a local data_aishell.tgz; downloaded automatically when omitted.
add_arg("filepath", default=None, type=str, help="压缩包data_aishell.tgz文件路径,不指定会自动下载")
# Directory the audio archive is unpacked into.
add_arg("target_dir", default="dataset/audio/", type=str, help="存放音频文件的目录")
# Directory where train.json / test.json annotation files are written.
add_arg("annotation_text", default="dataset/", type=str, help="存放音频标注文件的目录")
# NOTE(review): argparse's type=bool treats any non-empty string (including
# "False") as True — confirm the project's add_arguments handles this.
add_arg('add_pun', default=False, type=bool, help="是否添加标点符")
args = parser.parse_args()
def create_annotation_text(data_dir, annotation_path):
    """Build JSON-lines annotation files for the AISHELL corpus.

    Reads the official transcript file, optionally restores punctuation via a
    ModelScope pipeline (enabled by the module-level ``args.add_pun``), then
    walks ``wav/{train,dev}`` into ``train.json`` and ``wav/test`` into
    ``test.json``. Each output line is a JSON object of the form:
    ``{"audio": {"path": ...}, "sentence": ..., "duration": ...,
    "sentences": [{"start": 0, "end": duration, "text": ...}]}``.

    Args:
        data_dir: root of the unpacked ``data_aishell`` directory.
        annotation_path: output directory for ``train.json`` / ``test.json``.
    """
    print('Create Aishell annotation text ...')
    punctuator = _build_punctuator() if args.add_pun else None
    transcript_path = os.path.join(data_dir, 'transcript', 'aishell_transcript_v0.8.txt')
    transcript_dict = _load_transcripts(transcript_path, punctuator)
    os.makedirs(annotation_path, exist_ok=True)
    # train + dev splits are merged into the training manifest.
    train_lines = _collect_lines(data_dir, ['train', 'dev'], transcript_dict)
    _write_jsonl(os.path.join(annotation_path, 'train.json'), train_lines)
    test_lines = _collect_lines(data_dir, ['test'], transcript_dict)
    _write_jsonl(os.path.join(annotation_path, 'test.json'), test_lines)


def _build_punctuator():
    """Lazily construct the ModelScope punctuation-restoration pipeline.

    Imports are local so modelscope is only required when ``add_pun`` is set.
    """
    import logging
    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks
    from modelscope.utils.logger import get_logger
    # Silence modelscope's own logger; only CRITICAL messages get through.
    logger = get_logger(log_level=logging.CRITICAL)
    logger.setLevel(logging.CRITICAL)
    return pipeline(task=Tasks.punctuation,
                    model='damo/punc_ct-transformer_cn-en-common-vocab471067-large',
                    model_revision="v1.0.0")


def _load_transcripts(transcript_path, punctuator=None):
    """Parse the transcript file into ``{audio_id: text}``.

    All whitespace inside each utterance is removed; if *punctuator* is given,
    punctuation is restored on the cleaned text.
    """
    transcript_dict = {}
    with open(transcript_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f.readlines()):
            line = line.strip()
            if line == '':
                continue
            audio_id, text = line.split(' ', 1)
            # remove space
            text = ''.join(text.split())
            if punctuator is not None:
                text = punctuator(text_in=text)['text']
            transcript_dict[audio_id] = text
    return transcript_dict


def _collect_lines(data_dir, data_types, transcript_dict):
    """Gather annotation dicts (with audio durations) for the given splits."""
    lines = []
    for data_type in data_types:
        audio_dir = os.path.join(data_dir, 'wav', data_type)
        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
            for fname in filelist:
                audio_id = os.path.splitext(fname)[0]
                # skip audio files that have no transcription
                if audio_id not in transcript_dict:
                    continue
                lines.append({"audio": {"path": os.path.join(subfolder, fname)},
                              "sentence": transcript_dict[audio_id]})
    # Attach durations (seconds, rounded to 2 decimals) by reading each file.
    for line in tqdm(lines):
        sample, sr = soundfile.read(line['audio']['path'])
        duration = round(sample.shape[-1] / float(sr), 2)
        line["duration"] = duration
        line["sentences"] = [{"start": 0, "end": duration, "text": line["sentence"]}]
    return lines


def _write_jsonl(path, lines):
    """Write one JSON object per line (UTF-8, non-ASCII preserved)."""
    with open(path, 'w', encoding='utf-8') as f:
        for line in lines:
            f.write(json.dumps(line, ensure_ascii=False) + "\n")
def prepare_dataset(url, md5sum, target_dir, annotation_path, filepath=None):
    """Fetch and unpack the AISHELL archive, then build annotation manifests.

    Args:
        url: download URL for data_aishell.tgz.
        md5sum: expected MD5 checksum of the archive.
        target_dir: directory the archive is unpacked into.
        annotation_path: where train.json / test.json are written.
        filepath: optional pre-downloaded archive; downloaded when None.
    """
    data_dir = os.path.join(target_dir, 'data_aishell')
    if os.path.exists(data_dir):
        # Already unpacked — go straight to manifest creation.
        print("Skip downloading and unpacking. Aishell data already exists in %s." % target_dir)
    else:
        if filepath is None:
            filepath = download(url, md5sum, target_dir)
        unpack(filepath, target_dir)
        # The archive contains one tarball per speaker under wav/; unpack each
        # in place, then discard the top-level archive.
        wav_root = os.path.join(data_dir, 'wav')
        for folder, _, names in sorted(os.walk(wav_root)):
            for tar_name in names:
                unpack(os.path.join(folder, tar_name), folder, True)
        os.remove(filepath)
    create_annotation_text(data_dir, annotation_path)
def main():
    """Script entry point: echo arguments, expand ``~``, run preparation."""
    print_arguments(args)
    target = args.target_dir
    if target.startswith('~'):
        # Allow paths like ~/datasets on the command line.
        args.target_dir = os.path.expanduser(target)
    prepare_dataset(url=DATA_URL,
                    md5sum=MD5_DATA,
                    target_dir=args.target_dir,
                    annotation_path=args.annotation_text,
                    filepath=args.filepath)
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()