房价

回归问题

官方新方法封装

 
from tpf.datasets import load_boston
# Unpacked as (train X, train y, test X, test y); test_size=0.15 requests a 15% test share.
X_train, y_train, X_test, y_test = load_boston(split=True,test_size=0.15)

 
shape:(430, 13), type:class 'numpy.ndarray', X_train
shape:(430,), type:class 'numpy.ndarray', y_train
    

官方旧方法

 
from sklearn.datasets import load_boston 
# return_X_y=True makes the loader hand back the (X, y) pair that is unpacked here.
# NOTE: load_boston is deprecated in scikit-learn 1.0 and slated for removal in 1.2.
X,y = load_boston(return_X_y=True)

==============   ==============
Samples total               506
Dimensionality               13
Features         real, positive
Targets           real 5. - 50.
==============   ==============
    
 
Dimensionality:维度,13个 
Features 特征:实数,正的 
Targets  标签:实数,浮点数,[5.0,50.0]

 
Signature: load_boston(*, return_X_y=False)
Docstring:
DEPRECATED: `load_boston` is deprecated in 1.0 and will be removed in 1.2.

return_X_y=True
直接返回 (X, y) 元组,即特征矩阵与标签向量;默认 return_X_y=False 时返回一个包含各种信息的 Bunch 对象(类似字典,不是pandas数据集),还需要自己从中提取特征列(data)、标签(target)

X:大写字母,意味着这是一个矩阵,至少是二维
y:小写字母,意味着这是一个向量,一维 
    
乳腺癌

 
from sklearn.datasets import load_breast_cancer  
from sklearn.model_selection import train_test_split  

# Load the breast cancer dataset and pull out features and labels in one step
data = load_breast_cancer()
X, y = data.data, data.target

# Hold out 20% of the samples as the test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
    
    

 
X_train.shape,y_train.shape
((455, 30), (455,))

二分类问题:breast_cancer(乳腺癌)


自定义一个数据集模块ds,封装一些常用的小数据集

# Excerpt from the wrapper body: `test_size` and `random_state` are not defined
# in this snippet, so they are presumably parameters of the enclosing function.
data = datasets.load_breast_cancer()
X = data.data  # numpy 
y = data.target
# train_test_split returns (X_train, X_test, y_train, y_test) in that order.
X_train, X_test, y_train,  y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

 

    

 
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the breast cancer dataset and wrap features and labels in DataFrames
data = load_breast_cancer()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.DataFrame(data=data.target, columns=["target"])

# Keep the feature column names, then join features and label side by side
columns = X.columns
df = pd.concat([X, y], axis="columns")

 


 


 

  

 


鸢尾花

多分类问题:iris

    
def load_iris(split=True, *, test_size=0.20, random_state=73):
    """Load the iris dataset, optionally split into train and test sets.

    Args:
        split: If True, split the data into a training set and a test set;
            if False, return the full data unsplit.
        test_size: Passed to ``train_test_split`` when ``split`` is true.
            Defaults to 0.20, the value that used to be hard-coded.
        random_state: Seed passed to ``train_test_split`` when ``split`` is
            true. Defaults to 73, the value that used to be hard-coded.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` when ``split`` is true,
        otherwise ``(X, y)`` for the whole dataset.

    Examples:
        data_list = ds.load_iris()

        X_train, y_train, X_test, y_test = ds.load_iris()

        X, y = ds.load_iris(split=False)
    """
    data = datasets.load_iris()
    if not split:
        return data.data, data.target

    x_train, x_test, y_train, y_test = train_test_split(
        data.data,
        data.target,
        test_size=test_size,
        random_state=random_state,
    )
    # Deliberately reordered relative to train_test_split's output:
    # the training pair comes first, then the test pair.
    return x_train, y_train, x_test, y_test

参考