python random

random.randint(a, b) 返回闭区间 [a, b] 内的随机整数,左右端点均包含。

 
import random
# random.randint(a, b) returns a random integer N with a <= N <= b,
# i.e. both end points are included in the range.
random.randint(1,2)
# Sample result recorded in the notes for the call above:
1 

random.randint(1,2)
# Sample result recorded for a second call (the other end point):
2 

numpy 概率抽取

 
# Draw n items from `words` with replacement; `p` holds the selection probability of each item.
x = np.random.choice(words, size=n, replace=True, p=p)

replace=True 表示有放回抽样
- 有放回抽样时同一元素可以被重复抽中,因此 size 可以取任意数量,甚至超过 words 的长度;replace=False(无放回)时 size 不能超过 words 的长度

np.random.choice:Generates a random sample from a given 1-D array

 
import numpy as np 
import random

# Inclusive (min, max) bounds for the length of the generated sequence.
seq_len = (30, 48)

# Vocabulary: the digit row followed by the three letter rows of a keyboard.
words = list("0123456789" "qwertyuiop" "asdfghjkl" "zxcvbnm")

# Unnormalised selection weight of every word, in the same order as `words`:
# 1..10 for the digits, 1..10 for the first letter row, 11..26 for the rest.
p = np.concatenate((np.arange(1, 11), np.arange(1, 11), np.arange(11, 27)))

# Normalise so that the weights form a probability distribution (sum == 1).
p = p / p.sum()

# Pick the sequence length; randint includes both end points.
n = random.randint(*seq_len)
# Sample n words with replacement, each word chosen according to p.
x = np.random.choice(words, size=n, replace=True, p=p)

 
x
array(['f', 'g', 'v', 'v', 's', 'z', '4', 'f', 'n', 'z', 'k', 'e', 'p',
       'x', 'g', 'o', 'j', 'f', 's', 'z', 'c', 'l', '4', 'k', 'n', '8',
       'm', 'k', 'h', 'f', 'p', 's', 'i', 's', 'c', 'a', 'x', 'v', 'o',
       'm', 'a'], dtype='<U1')

参数说明

 
重点说明两个参数:大小size,可放回replace 

size : int or tuple of ints, optional
    Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
    ``m * n * k`` samples are drawn.  Default is None, in which case a
    single value is returned

# Passing a tuple as `size` draws 5 * 10 = 50 samples and returns them as a 5x10 array.
x = np.random.choice(words, size=(5,10), replace=True, p=p)
x.shape
# Shape recorded in the notes for the call above:
(5, 10)


replace : boolean, optional
    Whether the sample is with or without replacement. Default is True,
    meaning that a value of ``a`` can be selected multiple times.
replace:是否放回,默认为 True(有放回),即同一个元素可以被多次抽中

pandas 综合抽样

 
import torch 
import pandas as pd 

# Build a 30x4 demo table holding the values 1..120, filled row by row.
# The tensor is converted to NumPy explicitly: pandas has no native torch
# support, and pandas versions that treat a tensor as a generic iterable end up
# storing `tensor(...)` objects in the cells instead of plain floats.
a = torch.linspace(start=1, end=120, steps=120).reshape(30, 4)
a = pd.DataFrame(a.numpy(), columns=["A", "B", "C", "D"])
print(a)

# Stand-in for loading a real data set.
df = a

# Draw 10 rows at random (without replacement by default);
# this raises an error if the frame has fewer than 10 rows.
df.sample(n=10)

# Draw a random 20% of the rows.
df.sample(frac=0.2)

# Allow the same row to be drawn more than once.
df.sample(n=10, replace=True)

# Give every row its own sampling weight (one weight per row, 30 in total).
# Converted to NumPy for the same reason as the frame data above.
weights = torch.rand(30).numpy()
df.sample(n=10, weights=weights)

# Fix the random seed so the same rows are drawn on every run.
df.sample(n=10, random_state=73)

参考