openslr

 
http://openslr.org/

OpenSLR is a site devoted to hosting speech and language resources, 
such as training corpora for speech recognition, 
and software related to speech recognition. 

aishell

 
https://aishelltech.com/kysjcp
    

aishell1数据下载 

nohup wget https://openslr.elda.org/resources/33/data_aishell.tgz --no-check-certificate &

wget https://openslr.elda.org/resources/33/resource_aishell.tgz

ssh xt@101.43.140.244    # files are under /opt/soft (ssh takes no host:path argument; that form is for scp/rsync)
ssh -p 26225 144.34.185.72
rsync -e 'ssh -p26225' -avP 144.34.185.72:/opt/soft/* ./ 


 

    

 
(base) xt@ai:~/data/data_aishell$ ll
total 28
drwxr-xr-x 4 xt xt  4096  6月 16  2017 ./
drwxrwxr-x 3 xt xt  4096  4月 12 16:15 ../
drwxr-xr-x 2 xt xt  4096  7月  5  2017 transcript/
drwxr-xr-x 2 xt xt 16384  6月 16  2017 wav/
    

transcript

 
(base) xt@ai:~/data/data_aishell$ ll transcript/
total 9864
drwxr-xr-x 2 xt xt     4096  7月  5  2017 ./
drwxr-xr-x 4 xt xt     4096  6月 16  2017 ../
-rw-r--r-- 1 xt xt 10091431  7月  5  2017 aishell_transcript_v0.8.txt

    
(base) xt@ai:~/data/data_aishell$ tail -f -n 3 transcript/aishell_transcript_v0.8.txt 
BAC009S0916W0493     公司  却  遇到  了  资金  困难  
BAC009S0916W0494     存在  无法  如期  还贷  的  风险  
BAC009S0916W0495     这  令  被  贷款  的  员工  们  寝食  难  安 

wav

 
$ ls wav/
S0002.tar.gz  S0042.tar.gz  S0082.tar.gz
    
(base) xt@ai:~/data/data_aishell$ ls wav/ | wc -l
400


(base) xt@ai:~/data/data_aishell$ cp wav/S0002.tar.gz /tmp/
(base) xt@ai:~/data/data_aishell$ 
(base) xt@ai:~/data/data_aishell$ cd /tmp/
(base) xt@ai:/tmp$ tar -xvf S0002.tar.gz 
train/S0002/
train/S0002/BAC009S0002W0122.wav
train/S0002/BAC009S0002W0123.wav
train/S0002/BAC009S0002W0124.wav
train/S0002/BAC009S0002W0125.wav
train/S0002/BAC009S0002W0126.wav
...
...
(base) xt@ai:/tmp/train$ ls /tmp/train/S0002/ |wc -l
365


(base) xt@ai:~/data/data_aishell/wav$ cp S0916.tar.gz /tmp/data/
(base) xt@ai:~/data/data_aishell/wav$ cd /tmp/data/
(base) xt@ai:/tmp/data$ ls
S0916.tar.gz
(base) xt@ai:/tmp/data$ tar -xvf S0916.tar.gz 
test/S0916/
test/S0916/BAC009S0916W0121.wav
test/S0916/BAC009S0916W0122.wav

(base) xt@ai:/tmp/data/test/S0916$ ls /tmp/data/test/S0916/|wc -l
361

vim get_data.sh

 
#!/bin/bash
#
# get_data.sh - unpack every *.tar.gz archive found in a directory into the
# current working directory.
#
# Usage: get_data.sh [ARCHIVE_DIR]
#   ARCHIVE_DIR  directory holding the *.tar.gz files
#                (default: /home/xt/data/data_aishell/wav)
#
# `set -e` is deliberately not enabled: one corrupt archive must not stop the
# remaining archives from being extracted. Failures are tracked explicitly.
set -u

#######################################
# Extract all *.tar.gz archives located directly inside a directory.
# Arguments: $1 - directory containing the archives
# Outputs:   tar's verbose file listing on stdout; diagnostics on stderr
# Returns:   0 if every matched archive extracted cleanly (or nothing
#            matched), 1 if at least one tar invocation failed
#######################################
extract_archives() {
  local dir=$1
  local archive
  local status=0

  for archive in "$dir"/*.tar.gz; do
    # When the glob matches nothing, bash leaves the literal pattern in
    # place; the -f test catches that case as well as non-regular files.
    if [[ ! -f "$archive" ]]; then
      printf 'File does not exist: %s\n' "$archive" >&2
      continue
    fi

    # Quoted so that paths containing spaces or glob characters reach tar
    # as a single argument.
    if ! tar -xvf "$archive"; then
      printf 'Failed to extract: %s\n' "$archive" >&2
      status=1
    fi
  done

  return "$status"
}

extract_archives "${1:-/home/xt/data/data_aishell/wav}"

 
[root@ki wav]# ls dev/
S0724  S0726  S0728  S0730  S0732  S0734  S0736  S0738  S0740  S0742  S0744  S0746  S0748  S0750  S0752  S0754  S0756  S0758  S0760  S0762
S0725  S0727  S0729  S0731  S0733  S0735  S0737  S0739  S0741  S0743  S0745  S0747  S0749  S0751  S0753  S0755  S0757  S0759  S0761  S0763
[root@ki wav]# ls test/
S0764  S0765  S0766  S0767  S0768  S0769  S0770  S0901  S0902  S0903  S0904  S0905  S0906  S0907  S0908  S0912  S0913  S0914  S0915  S0916
[root@ki wav]# ls train/
S0002  S0014  S0026  S0038  S0050  S0062  S0074  S0086  S0098  S0110  S0122  S0134  S0146  S0158  S0170  S0182  S0194  S0206  S0218  S0230  S0242  S0335  S0347  S0359  S0513  S0597  S0662  S0708  S0720
S0003  S0015  S0027  S0039  S0051  S0063  S0075  S0087  S0099  S0111  S0123  S0135  S0147  S0159  S0171  S0183  S0195  S0207  S0219  S0231  S0243  S0336  S0348  S0360  S0514  S0598  S0663  S0709  S0721
S0004  S0016  S0028  S0040  S0052  S0064  S0076  S0088  S0100  S0112  S0124  S0136  S0148  S0160  S0172  S0184  S0196  S0208  S0220  S0232  S0244  S0337  S0349  S0361  S0515  S0599  S0664  S0710  S0722
S0005  S0017  S0029  S0041  S0053  S0065  S0077  S0089  S0101  S0113  S0125  S0137  S0149  S0161  S0173  S0185  S0197  S0209  S0221  S0233  S0245  S0338  S0350  S0362  S0516  S0600  S0665  S0711  S0723
S0006  S0018  S0030  S0042  S0054  S0066  S0078  S0090  S0102  S0114  S0126  S0138  S0150  S0162  S0174  S0186  S0198  S0210  S0222  S0234  S0246  S0339  S0351  S0363  S0517  S0601  S0666  S0712
S0007  S0019  S0031  S0043  S0055  S0067  S0079  S0091  S0103  S0115  S0127  S0139  S0151  S0163  S0175  S0187  S0199  S0211  S0223  S0235  S0247  S0340  S0352  S0421  S0518  S0655  S0701  S0713
S0008  S0020  S0032  S0044  S0056  S0068  S0080  S0092  S0104  S0116  S0128  S0140  S0152  S0164  S0176  S0188  S0200  S0212  S0224  S0236  S0248  S0341  S0353  S0422  S0519  S0656  S0702  S0714
S0009  S0021  S0033  S0045  S0057  S0069  S0081  S0093  S0105  S0117  S0129  S0141  S0153  S0165  S0177  S0189  S0201  S0213  S0225  S0237  S0249  S0342  S0354  S0423  S0520  S0657  S0703  S0715
S0010  S0022  S0034  S0046  S0058  S0070  S0082  S0094  S0106  S0118  S0130  S0142  S0154  S0166  S0178  S0190  S0202  S0214  S0226  S0238  S0250  S0343  S0355  S0509  S0521  S0658  S0704  S0716
S0011  S0023  S0035  S0047  S0059  S0071  S0083  S0095  S0107  S0119  S0131  S0143  S0155  S0167  S0179  S0191  S0203  S0215  S0227  S0239  S0251  S0344  S0356  S0510  S0522  S0659  S0705  S0717
S0012  S0024  S0036  S0048  S0060  S0072  S0084  S0096  S0108  S0120  S0132  S0144  S0156  S0168  S0180  S0192  S0204  S0216  S0228  S0240  S0252  S0345  S0357  S0511  S0523  S0660  S0706  S0718
S0013  S0025  S0037  S0049  S0061  S0073  S0085  S0097  S0109  S0121  S0133  S0145  S0157  S0169  S0181  S0193  S0205  S0217  S0229  S0241  S0334  S0346  S0358  S0512  S0596  S0661  S0707  S0719
[root@ki wav]# pwd
/data/wave/aishell/data_aishell/wav

    

 

  

 

  

 


THCHS-30

 
https://www.openslr.org/resources/18/data_thchs30.tgz
https://www.openslr.org/resources/18/test-noise.tgz
https://www.openslr.org/resources/18/resource.tgz

CONTACTOR

 
CONTACTOR

    Dong Wang wangdong99@mails.tsinghua.edu.cn
    Xuewei Zhang zxw@cslt.riit.tsinghua.edu.cn
    Zhiyong Zhang zhangzy@cslt.riit.tsinghua.edu.cn 

ROOM1-303, BLDG FIT

CSLT, Tsinghua University

http://cslt.org

http://cslt.riit.tsinghua.edu.cn


External URLs:
http://data.cslt.org/thchs30/README.html   (Original URL from CSLT )
http://pan.baidu.com/s/1hqKwE00   ( Baidu disk ) 

 
xt@ai:/data/wave/data_thchs30$ ll
total 8261428
drwxrwxr-x 3 xt xt       4096  4月  5 19:22 ./
drwxr-xr-x 4 xt xt       4096  4月  5 18:27 ../
drwxr-xr-x 8 xt xt       4096 12月 30  2015 data_thchs30/
-rw-rw-r-- 1 xt xt 6453425169 10月  4  2017 data_thchs30.tgz
-rw------- 1 xt xt    9972323  1月  6 16:27 nohup.out
-rw-rw-r-- 1 xt xt   24813708 10月  4  2017 resource.tgz
-rw-rw-r-- 1 xt xt 1971460210 10月  4  2017 test-noise.tgz

xt@ai:/data/wave/data_thchs30$ cd data_thchs30/
xt@ai:/data/wave/data_thchs30/data_thchs30$ ls
data  dev  lm_phone  lm_word  README.TXT  test  train

    
wav48

 
/media/xt/DE22538E22536A93/data/yuyin/hanyu2/wav48

-rwxrwxrwx 1 1000 1000     81 Jan 25  2022 96
-rwxrwxrwx 1 1000 1000 566862 Jan 25  2022 96.wav
-rwxrwxrwx 1 1000 1000     91 Jan 25  2022 97
-rwxrwxrwx 1 1000 1000 698958 Jan 25  2022 97.wav
-rwxrwxrwx 1 1000 1000     53 Jan 25  2022 98
-rwxrwxrwx 1 1000 1000 569550 Jan 25  2022 98.wav
-rwxrwxrwx 1 1000 1000     68 Jan 25  2022 99
-rwxrwxrwx 1 1000 1000 481230 Jan 25  2022 99.wav
[root@ki wav48]# 


 

    

 


 

  

 


参考