{ "cells": [ { "cell_type": "markdown", "id": "fafe7b07", "metadata": {}, "source": [ "***Decision tree***\n", "- overfitting(low bias,high variance) \n", "\n", "***Random forest*** \n", "\n", "- Ensemble classifier\n", "- group of decision trees can be called as random forest\n", "- we will consider the output of each decision tree and define the highest repeated times output as the predicted ouput" ] }, { "cell_type": "code", "execution_count": 1, "id": "d71f4c2b", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "df=pd.read_csv(\"C:/Users/HP/Downloads/archive(3)/heart.csv\")" ] }, { "cell_type": "code", "execution_count": 2, "id": "dde9a1e4", "metadata": { "collapsed": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agesexcptrestbpscholfbsrestecgthalachexangoldpeakslopecathaltarget
063131452331015002.30011
137121302500118703.50021
241011302040017201.42021
356111202360117800.82021
457001203540116310.62021
\n", "
" ], "text/plain": [ " age sex cp trestbps chol fbs restecg thalach exang oldpeak slope \\\n", "0 63 1 3 145 233 1 0 150 0 2.3 0 \n", "1 37 1 2 130 250 0 1 187 0 3.5 0 \n", "2 41 0 1 130 204 0 0 172 0 1.4 2 \n", "3 56 1 1 120 236 0 1 178 0 0.8 2 \n", "4 57 0 0 120 354 0 1 163 1 0.6 2 \n", "\n", " ca thal target \n", "0 0 1 1 \n", "1 0 2 1 \n", "2 0 2 1 \n", "3 0 2 1 \n", "4 0 2 1 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 3, "id": "fd41bea5", "metadata": {}, "outputs": [], "source": [ "x=df.drop(\"target\",axis=1)\n", "y=df[\"target\"]" ] }, { "cell_type": "code", "execution_count": 4, "id": "4edd276a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "RandomForestClassifier()" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.model_selection import train_test_split\n", "x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,random_state=42)\n", "from sklearn.ensemble import RandomForestClassifier\n", "model=RandomForestClassifier()\n", "model.fit(x_train,y_train)" ] }, { "cell_type": "code", "execution_count": 5, "id": "51716001", "metadata": {}, "outputs": [], "source": [ "y_pred=model.predict(x_test)" ] }, { "cell_type": "code", "execution_count": 6, "id": "d64f26a8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.8021978021978022" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.metrics import accuracy_score\n", "accuracy_score(y_test,y_pred)" ] }, { "cell_type": "code", "execution_count": 16, "id": "e4a1ed49", "metadata": { "collapsed": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Help on RandomForestClassifier in module sklearn.ensemble._forest object:\n", "\n", "class RandomForestClassifier(ForestClassifier)\n", " | RandomForestClassifier(n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)\n", " | \n", " | A random forest classifier.\n", " | \n", " | A random forest is a meta estimator that fits a number of decision tree\n", " | classifiers on various sub-samples of the dataset and uses averaging to\n", " | improve the predictive accuracy and control over-fitting.\n", " | The sub-sample size is controlled with the `max_samples` parameter if\n", " | `bootstrap=True` (default), otherwise the whole dataset is used to build\n", " | each tree.\n", " | \n", " | Read more in the :ref:`User Guide `.\n", " | \n", " | Parameters\n", " | ----------\n", " | n_estimators : int, default=100\n", " | The number of trees in the forest.\n", " | \n", " | .. versionchanged:: 0.22\n", " | The default value of ``n_estimators`` changed from 10 to 100\n", " | in 0.22.\n", " | \n", " | criterion : {\"gini\", \"entropy\"}, default=\"gini\"\n", " | The function to measure the quality of a split. Supported criteria are\n", " | \"gini\" for the Gini impurity and \"entropy\" for the information gain.\n", " | Note: this parameter is tree-specific.\n", " | \n", " | max_depth : int, default=None\n", " | The maximum depth of the tree. 
If None, then nodes are expanded until\n", " | all leaves are pure or until all leaves contain less than\n", " | min_samples_split samples.\n", " | \n", " | min_samples_split : int or float, default=2\n", " | The minimum number of samples required to split an internal node:\n", " | \n", " | - If int, then consider `min_samples_split` as the minimum number.\n", " | - If float, then `min_samples_split` is a fraction and\n", " | `ceil(min_samples_split * n_samples)` are the minimum\n", " | number of samples for each split.\n", " | \n", " | .. versionchanged:: 0.18\n", " | Added float values for fractions.\n", " | \n", " | min_samples_leaf : int or float, default=1\n", " | The minimum number of samples required to be at a leaf node.\n", " | A split point at any depth will only be considered if it leaves at\n", " | least ``min_samples_leaf`` training samples in each of the left and\n", " | right branches. This may have the effect of smoothing the model,\n", " | especially in regression.\n", " | \n", " | - If int, then consider `min_samples_leaf` as the minimum number.\n", " | - If float, then `min_samples_leaf` is a fraction and\n", " | `ceil(min_samples_leaf * n_samples)` are the minimum\n", " | number of samples for each node.\n", " | \n", " | .. versionchanged:: 0.18\n", " | Added float values for fractions.\n", " | \n", " | min_weight_fraction_leaf : float, default=0.0\n", " | The minimum weighted fraction of the sum total of weights (of all\n", " | the input samples) required to be at a leaf node. Samples have\n", " | equal weight when sample_weight is not provided.\n", " | \n", " | max_features : {\"auto\", \"sqrt\", \"log2\"}, int or float, default=\"auto\"\n", " | The number of features to consider when looking for the best split:\n", " | \n", " | - If int, then consider `max_features` features at each split.\n", " | - If float, then `max_features` is a fraction and\n", " | `round(max_features * n_features)` features are considered at each\n", " | split.\n", " | - If \"auto\", then `max_features=sqrt(n_features)`.\n", " | - If \"sqrt\", then `max_features=sqrt(n_features)` (same as \"auto\").\n", " | - If \"log2\", then `max_features=log2(n_features)`.\n", " | - If None, then `max_features=n_features`.\n", " | \n", " | Note: the search for a split does not stop until at least one\n", " | valid partition of the node samples is found, even if it requires to\n", " | effectively inspect more than ``max_features`` features.\n", " | \n", " | max_leaf_nodes : int, default=None\n", " | Grow trees with ``max_leaf_nodes`` in best-first fashion.\n", " | Best nodes are defined as relative reduction in impurity.\n", " | If None then unlimited number of leaf nodes.\n", " | \n", " | min_impurity_decrease : float, default=0.0\n", " | A node will be split if this split induces a decrease of the impurity\n", " | greater than or equal to this value.\n", " | \n", " | The weighted impurity decrease equation is the following::\n", " | \n", " | N_t / N * (impurity - N_t_R / N_t * right_impurity\n", " | - N_t_L / N_t * left_impurity)\n", " | \n", " | where ``N`` is the total number of samples, ``N_t`` is the number of\n", " | samples at the current node, ``N_t_L`` is the number of samples in the\n", " | left child, and ``N_t_R`` is the number of samples in the right child.\n", " | \n", " | ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n", " | if ``sample_weight`` is passed.\n", " | \n", " | .. 
versionadded:: 0.19\n", " | \n", " | min_impurity_split : float, default=None\n", " | Threshold for early stopping in tree growth. A node will split\n", " | if its impurity is above the threshold, otherwise it is a leaf.\n", " | \n", " | .. deprecated:: 0.19\n", " | ``min_impurity_split`` has been deprecated in favor of\n", " | ``min_impurity_decrease`` in 0.19. The default value of\n", " | ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it\n", " | will be removed in 1.0 (renaming of 0.25).\n", " | Use ``min_impurity_decrease`` instead.\n", " | \n", " | bootstrap : bool, default=True\n", " | Whether bootstrap samples are used when building trees. If False, the\n", " | whole dataset is used to build each tree.\n", " | \n", " | oob_score : bool, default=False\n", " | Whether to use out-of-bag samples to estimate\n", " | the generalization accuracy.\n", " | \n", " | n_jobs : int, default=None\n", " | The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,\n", " | :meth:`decision_path` and :meth:`apply` are all parallelized over the\n", " | trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`\n", " | context. ``-1`` means using all processors. See :term:`Glossary\n", " | ` for more details.\n", " | \n", " | random_state : int, RandomState instance or None, default=None\n", " | Controls both the randomness of the bootstrapping of the samples used\n", " | when building trees (if ``bootstrap=True``) and the sampling of the\n", " | features to consider when looking for the best split at each node\n", " | (if ``max_features < n_features``).\n", " | See :term:`Glossary ` for details.\n", " | \n", " | verbose : int, default=0\n", " | Controls the verbosity when fitting and predicting.\n", " | \n", " | warm_start : bool, default=False\n", " | When set to ``True``, reuse the solution of the previous call to fit\n", " | and add more estimators to the ensemble, otherwise, just fit a whole\n", " | new forest. See :term:`the Glossary `.\n", " | \n", " | class_weight : {\"balanced\", \"balanced_subsample\"}, dict or list of dicts, default=None\n", " | Weights associated with classes in the form ``{class_label: weight}``.\n", " | If not given, all classes are supposed to have weight one. For\n", " | multi-output problems, a list of dicts can be provided in the same\n", " | order as the columns of y.\n", " | \n", " | Note that for multioutput (including multilabel) weights should be\n", " | defined for each class of every column in its own dict. For example,\n", " | for four-class multilabel classification weights should be\n", " | [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of\n", " | [{1:1}, {2:5}, {3:1}, {4:1}].\n", " | \n", " | The \"balanced\" mode uses the values of y to automatically adjust\n", " | weights inversely proportional to class frequencies in the input data\n", " | as ``n_samples / (n_classes * np.bincount(y))``\n", " | \n", " | The \"balanced_subsample\" mode is the same as \"balanced\" except that\n", " | weights are computed based on the bootstrap sample for every tree\n", " | grown.\n", " | \n", " | For multi-output, the weights of each column of y will be multiplied.\n", " | \n", " | Note that these weights will be multiplied with sample_weight (passed\n", " | through the fit method) if sample_weight is specified.\n", " | \n", " | ccp_alpha : non-negative float, default=0.0\n", " | Complexity parameter used for Minimal Cost-Complexity Pruning. 
The\n", " | subtree with the largest cost complexity that is smaller than\n", " | ``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n", " | :ref:`minimal_cost_complexity_pruning` for details.\n", " | \n", " | .. versionadded:: 0.22\n", " | \n", " | max_samples : int or float, default=None\n", " | If bootstrap is True, the number of samples to draw from X\n", " | to train each base estimator.\n", " | \n", " | - If None (default), then draw `X.shape[0]` samples.\n", " | - If int, then draw `max_samples` samples.\n", " | - If float, then draw `max_samples * X.shape[0]` samples. Thus,\n", " | `max_samples` should be in the interval `(0, 1)`.\n", " | \n", " | .. versionadded:: 0.22\n", " | \n", " | Attributes\n", " | ----------\n", " | base_estimator_ : DecisionTreeClassifier\n", " | The child estimator template used to create the collection of fitted\n", " | sub-estimators.\n", " | \n", " | estimators_ : list of DecisionTreeClassifier\n", " | The collection of fitted sub-estimators.\n", " | \n", " | classes_ : ndarray of shape (n_classes,) or a list of such arrays\n", " | The classes labels (single output problem), or a list of arrays of\n", " | class labels (multi-output problem).\n", " | \n", " | n_classes_ : int or list\n", " | The number of classes (single output problem), or a list containing the\n", " | number of classes for each output (multi-output problem).\n", " | \n", " | n_features_ : int\n", " | The number of features when ``fit`` is performed.\n", " | \n", " | n_outputs_ : int\n", " | The number of outputs when ``fit`` is performed.\n", " | \n", " | feature_importances_ : ndarray of shape (n_features,)\n", " | The impurity-based feature importances.\n", " | The higher, the more important the feature.\n", " | The importance of a feature is computed as the (normalized)\n", " | total reduction of the criterion brought by that feature. It is also\n", " | known as the Gini importance.\n", " | \n", " | Warning: impurity-based feature importances can be misleading for\n", " | high cardinality features (many unique values). See\n", " | :func:`sklearn.inspection.permutation_importance` as an alternative.\n", " | \n", " | oob_score_ : float\n", " | Score of the training dataset obtained using an out-of-bag estimate.\n", " | This attribute exists only when ``oob_score`` is True.\n", " | \n", " | oob_decision_function_ : ndarray of shape (n_samples, n_classes)\n", " | Decision function computed with out-of-bag estimate on the training\n", " | set. If n_estimators is small it might be possible that a data point\n", " | was never left out during the bootstrap. In this case,\n", " | `oob_decision_function_` might contain NaN. This attribute exists\n", " | only when ``oob_score`` is True.\n", " | \n", " | See Also\n", " | --------\n", " | DecisionTreeClassifier, ExtraTreesClassifier\n", " | \n", " | Notes\n", " | -----\n", " | The default values for the parameters controlling the size of the trees\n", " | (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and\n", " | unpruned trees which can potentially be very large on some data sets. To\n", " | reduce memory consumption, the complexity and size of the trees should be\n", " | controlled by setting those parameter values.\n", " | \n", " | The features are always randomly permuted at each split. 
Therefore,\n", " | the best found split may vary, even with the same training data,\n", " | ``max_features=n_features`` and ``bootstrap=False``, if the improvement\n", " | of the criterion is identical for several splits enumerated during the\n", " | search of the best split. To obtain a deterministic behaviour during\n", " | fitting, ``random_state`` has to be fixed.\n", " | \n", " | References\n", " | ----------\n", " | .. [1] L. Breiman, \"Random Forests\", Machine Learning, 45(1), 5-32, 2001.\n", " | \n", " | Examples\n", " | --------\n", " | >>> from sklearn.ensemble import RandomForestClassifier\n", " | >>> from sklearn.datasets import make_classification\n", " | >>> X, y = make_classification(n_samples=1000, n_features=4,\n", " | ... n_informative=2, n_redundant=0,\n", " | ... random_state=0, shuffle=False)\n", " | >>> clf = RandomForestClassifier(max_depth=2, random_state=0)\n", " | >>> clf.fit(X, y)\n", " | RandomForestClassifier(...)\n", " | >>> print(clf.predict([[0, 0, 0, 0]]))\n", " | [1]\n", " | \n", " | Method resolution order:\n", " | RandomForestClassifier\n", " | ForestClassifier\n", " | sklearn.base.ClassifierMixin\n", " | BaseForest\n", " | sklearn.base.MultiOutputMixin\n", " | sklearn.ensemble._base.BaseEnsemble\n", " | sklearn.base.MetaEstimatorMixin\n", " | sklearn.base.BaseEstimator\n", " | builtins.object\n", " | \n", " | Methods defined here:\n", " | \n", " | __init__(self, n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)\n", " | Initialize self. See help(type(self)) for accurate signature.\n", " | \n", " | ----------------------------------------------------------------------\n", " | Data and other attributes defined here:\n", " | \n", " | __abstractmethods__ = frozenset()\n", " | \n", " | ----------------------------------------------------------------------\n", " | Methods inherited from ForestClassifier:\n", " | \n", " | predict(self, X)\n", " | Predict class for X.\n", " | \n", " | The predicted class of an input sample is a vote by the trees in\n", " | the forest, weighted by their probability estimates. That is,\n", " | the predicted class is the one with highest mean probability\n", " | estimate across the trees.\n", " | \n", " | Parameters\n", " | ----------\n", " | X : {array-like, sparse matrix} of shape (n_samples, n_features)\n", " | The input samples. Internally, its dtype will be converted to\n", " | ``dtype=np.float32``. If a sparse matrix is provided, it will be\n", " | converted into a sparse ``csr_matrix``.\n", " | \n", " | Returns\n", " | -------\n", " | y : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n", " | The predicted classes.\n", " | \n", " | predict_log_proba(self, X)\n", " | Predict class log-probabilities for X.\n", " | \n", " | The predicted class log-probabilities of an input sample is computed as\n", " | the log of the mean predicted class probabilities of the trees in the\n", " | forest.\n", " | \n", " | Parameters\n", " | ----------\n", " | X : {array-like, sparse matrix} of shape (n_samples, n_features)\n", " | The input samples. Internally, its dtype will be converted to\n", " | ``dtype=np.float32``. 
If a sparse matrix is provided, it will be\n", " | converted into a sparse ``csr_matrix``.\n", " | \n", " | Returns\n", " | -------\n", " | p : ndarray of shape (n_samples, n_classes), or a list of n_outputs\n", " | such arrays if n_outputs > 1.\n", " | The class probabilities of the input samples. The order of the\n", " | classes corresponds to that in the attribute :term:`classes_`.\n", " | \n", " | predict_proba(self, X)\n", " | Predict class probabilities for X.\n", " | \n", " | The predicted class probabilities of an input sample are computed as\n", " | the mean predicted class probabilities of the trees in the forest.\n", " | The class probability of a single tree is the fraction of samples of\n", " | the same class in a leaf.\n", " | \n", " | Parameters\n", " | ----------\n", " | X : {array-like, sparse matrix} of shape (n_samples, n_features)\n", " | The input samples. Internally, its dtype will be converted to\n", " | ``dtype=np.float32``. If a sparse matrix is provided, it will be\n", " | converted into a sparse ``csr_matrix``.\n", " | \n", " | Returns\n", " | -------\n", " | p : ndarray of shape (n_samples, n_classes), or a list of n_outputs\n", " | such arrays if n_outputs > 1.\n", " | The class probabilities of the input samples. The order of the\n", " | classes corresponds to that in the attribute :term:`classes_`.\n", " | \n", " | ----------------------------------------------------------------------\n", " | Methods inherited from sklearn.base.ClassifierMixin:\n", " | \n", " | score(self, X, y, sample_weight=None)\n", " | Return the mean accuracy on the given test data and labels.\n", " | \n", " | In multi-label classification, this is the subset accuracy\n", " | which is a harsh metric since you require for each sample that\n", " | each label set be correctly predicted.\n", " | \n", " | Parameters\n", " | ----------\n", " | X : array-like of shape (n_samples, n_features)\n", " | Test samples.\n", " | \n", " | y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n", " | True labels for `X`.\n", " | \n", " | sample_weight : array-like of shape (n_samples,), default=None\n", " | Sample weights.\n", " | \n", " | Returns\n", " | -------\n", " | score : float\n", " | Mean accuracy of ``self.predict(X)`` wrt. `y`.\n", " | \n", " | ----------------------------------------------------------------------\n", " | Data descriptors inherited from sklearn.base.ClassifierMixin:\n", " | \n", " | __dict__\n", " | dictionary for instance variables (if defined)\n", " | \n", " | __weakref__\n", " | list of weak references to the object (if defined)\n", " | \n", " | ----------------------------------------------------------------------\n", " | Methods inherited from BaseForest:\n", " | \n", " | apply(self, X)\n", " | Apply trees in the forest to X, return leaf indices.\n", " | \n", " | Parameters\n", " | ----------\n", " | X : {array-like, sparse matrix} of shape (n_samples, n_features)\n", " | The input samples. Internally, its dtype will be converted to\n", " | ``dtype=np.float32``. If a sparse matrix is provided, it will be\n", " | converted into a sparse ``csr_matrix``.\n", " | \n", " | Returns\n", " | -------\n", " | X_leaves : ndarray of shape (n_samples, n_estimators)\n", " | For each datapoint x in X and for each tree in the forest,\n", " | return the index of the leaf x ends up in.\n", " | \n", " | decision_path(self, X)\n", " | Return the decision path in the forest.\n", " | \n", " | .. 
versionadded:: 0.18\n", " | \n", " | Parameters\n", " | ----------\n", " | X : {array-like, sparse matrix} of shape (n_samples, n_features)\n", " | The input samples. Internally, its dtype will be converted to\n", " | ``dtype=np.float32``. If a sparse matrix is provided, it will be\n", " | converted into a sparse ``csr_matrix``.\n", " | \n", " | Returns\n", " | -------\n", " | indicator : sparse matrix of shape (n_samples, n_nodes)\n", " | Return a node indicator matrix where non zero elements indicates\n", " | that the samples goes through the nodes. The matrix is of CSR\n", " | format.\n", " | \n", " | n_nodes_ptr : ndarray of shape (n_estimators + 1,)\n", " | The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]]\n", " | gives the indicator value for the i-th estimator.\n", " | \n", " | fit(self, X, y, sample_weight=None)\n", " | Build a forest of trees from the training set (X, y).\n", " | \n", " | Parameters\n", " | ----------\n", " | X : {array-like, sparse matrix} of shape (n_samples, n_features)\n", " | The training input samples. Internally, its dtype will be converted\n", " | to ``dtype=np.float32``. If a sparse matrix is provided, it will be\n", " | converted into a sparse ``csc_matrix``.\n", " | \n", " | y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n", " | The target values (class labels in classification, real numbers in\n", " | regression).\n", " | \n", " | sample_weight : array-like of shape (n_samples,), default=None\n", " | Sample weights. If None, then samples are equally weighted. Splits\n", " | that would create child nodes with net zero or negative weight are\n", " | ignored while searching for a split in each node. In the case of\n", " | classification, splits are also ignored if they would result in any\n", " | single class carrying a negative weight in either child node.\n", " | \n", " | Returns\n", " | -------\n", " | self : object\n", " | \n", " | ----------------------------------------------------------------------\n", " | Readonly properties inherited from BaseForest:\n", " | \n", " | feature_importances_\n", " | The impurity-based feature importances.\n", " | \n", " | The higher, the more important the feature.\n", " | The importance of a feature is computed as the (normalized)\n", " | total reduction of the criterion brought by that feature. It is also\n", " | known as the Gini importance.\n", " | \n", " | Warning: impurity-based feature importances can be misleading for\n", " | high cardinality features (many unique values). 
See\n", " | :func:`sklearn.inspection.permutation_importance` as an alternative.\n", " | \n", " | Returns\n", " | -------\n", " | feature_importances_ : ndarray of shape (n_features,)\n", " | The values of this array sum to 1, unless all trees are single node\n", " | trees consisting of only the root node, in which case it will be an\n", " | array of zeros.\n", " | \n", " | ----------------------------------------------------------------------\n", " | Methods inherited from sklearn.ensemble._base.BaseEnsemble:\n", " | \n", " | __getitem__(self, index)\n", " | Return the index'th estimator in the ensemble.\n", " | \n", " | __iter__(self)\n", " | Return iterator over estimators in the ensemble.\n", " | \n", " | __len__(self)\n", " | Return the number of estimators in the ensemble.\n", " | \n", " | ----------------------------------------------------------------------\n", " | Data and other attributes inherited from sklearn.ensemble._base.BaseEnsemble:\n", " | \n", " | __annotations__ = {'_required_parameters': typing.List[str]}\n", " | \n", " | ----------------------------------------------------------------------\n", " | Methods inherited from sklearn.base.BaseEstimator:\n", " | \n", " | __getstate__(self)\n", " | \n", " | __repr__(self, N_CHAR_MAX=700)\n", " | Return repr(self).\n", " | \n", " | __setstate__(self, state)\n", " | \n", " | get_params(self, deep=True)\n", " | Get parameters for this estimator.\n", " | \n", " | Parameters\n", " | ----------\n", " | deep : bool, default=True\n", " | If True, will return the parameters for this estimator and\n", " | contained subobjects that are estimators.\n", " | \n", " | Returns\n", " | -------\n", " | params : dict\n", " | Parameter names mapped to their values.\n", " | \n", " | set_params(self, **params)\n", " | Set the parameters of this estimator.\n", " | \n", " | The method works on simple estimators as well as on nested objects\n", " | (such as :class:`~sklearn.pipeline.Pipeline`). 
The latter have\n", " |         parameters of the form ``__`` so that it's\n", " |         possible to update each component of a nested object.\n", " |         \n", " |         Parameters\n", " |         ----------\n", " |         **params : dict\n", " |             Estimator parameters.\n", " |         \n", " |         Returns\n", " |         -------\n", " |         self : estimator instance\n", " |             Estimator instance.\n", "\n" ] } ], "source": [ "help(RandomForestClassifier())" ] }, { "cell_type": "code", "execution_count": 21, "id": "9ab979e4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fitting 3 folds for each of 100 candidates, totalling 300 fits\n", "{'n_estimators': 1600, 'max_features': 'sqrt', 'max_depth': 460}\n" ] } ], "source": [ "import numpy as np\n", "from sklearn.model_selection import RandomizedSearchCV\n", "\n", "# number of trees in the random forest\n", "n_estimators=[int(x) for x in np.linspace(start=200,stop=2000,num=10)]\n", "\n", "# max_features: number of features considered at each split\n", "max_features=['auto','sqrt']\n", "\n", "# max_depth: maximum number of levels in each tree\n", "max_depth=[int(x) for x in np.linspace(100,500,num=11)]\n", "\n", "random_grid={\"n_estimators\":n_estimators,\"max_features\":max_features,\"max_depth\":max_depth}\n", "rfc=RandomizedSearchCV(estimator=model,param_distributions=random_grid,n_iter=100,cv=3,random_state=42,verbose=2,n_jobs=-1)\n", "rfc.fit(x_train,y_train)\n", "print(rfc.best_params_)" ] },
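{ "cell_type": "markdown", "id": "best-est-md", "metadata": {}, "source": [ "A small aside (a sketch, not part of the original run): because `RandomizedSearchCV` uses `refit=True` by default, the best configuration found above is automatically refit on the whole training split, so `rfc.best_estimator_` can be reused directly instead of retyping `best_params_` by hand." ] }, { "cell_type": "code", "execution_count": null, "id": "best-est-code", "metadata": {}, "outputs": [], "source": [ "# Sketch: reuse the refit best estimator from the search above\n", "best_model=rfc.best_estimator_\n", "print(rfc.best_score_)                  # mean cross-validated accuracy of the best setting\n", "print(best_model.score(x_test,y_test))  # accuracy on the held-out test split" ] }, { "cell_type": "code", "execution_count": 18, "id": "a50a880b", "metadata": { "collapsed": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Help on class RandomizedSearchCV in module sklearn.model_selection._search:\n", "\n", "class RandomizedSearchCV(BaseSearchCV)\n", " |  RandomizedSearchCV(estimator, param_distributions, *, n_iter=10, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score=nan, return_train_score=False)\n", " |  \n", " |  Randomized search on hyper parameters.\n", " |  \n", " |  RandomizedSearchCV implements a \"fit\" and a \"score\" method.\n", " |  It also implements \"score_samples\", \"predict\", \"predict_proba\",\n", " |  \"decision_function\", \"transform\" and \"inverse_transform\" if they are\n", " |  implemented in the estimator used.\n", " |  \n", " |  The parameters of the estimator used to apply these methods are optimized\n", " |  by cross-validated search over parameter settings.\n", " |  \n", " |  In contrast to GridSearchCV, not all parameter values are tried out, but\n", " |  rather a fixed number of parameter settings is sampled from the specified\n", " |  distributions. The number of parameter settings that are tried is\n", " |  given by n_iter.\n", " |  \n", " |  If all parameters are presented as a list,\n", " |  sampling without replacement is performed. If at least one parameter\n", " |  is given as a distribution, sampling with replacement is used.\n", " |  It is highly recommended to use continuous distributions for continuous\n", " |  parameters.\n", " |  \n", " |  Read more in the :ref:`User Guide `.\n", " |  \n", " |  .. 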
versionadded:: 0.14\n", " | \n", " | Parameters\n", " | ----------\n", " | estimator : estimator object.\n", " | A object of that type is instantiated for each grid point.\n", " | This is assumed to implement the scikit-learn estimator interface.\n", " | Either estimator needs to provide a ``score`` function,\n", " | or ``scoring`` must be passed.\n", " | \n", " | param_distributions : dict or list of dicts\n", " | Dictionary with parameters names (`str`) as keys and distributions\n", " | or lists of parameters to try. Distributions must provide a ``rvs``\n", " | method for sampling (such as those from scipy.stats.distributions).\n", " | If a list is given, it is sampled uniformly.\n", " | If a list of dicts is given, first a dict is sampled uniformly, and\n", " | then a parameter is sampled using that dict as above.\n", " | \n", " | n_iter : int, default=10\n", " | Number of parameter settings that are sampled. n_iter trades\n", " | off runtime vs quality of the solution.\n", " | \n", " | scoring : str, callable, list/tuple or dict, default=None\n", " | A single str (see :ref:`scoring_parameter`) or a callable\n", " | (see :ref:`scoring`) to evaluate the predictions on the test set.\n", " | \n", " | For evaluating multiple metrics, either give a list of (unique) strings\n", " | or a dict with names as keys and callables as values.\n", " | \n", " | NOTE that when using custom scorers, each scorer should return a single\n", " | value. Metric functions returning a list/array of values can be wrapped\n", " | into multiple scorers that return one value each.\n", " | \n", " | See :ref:`multimetric_grid_search` for an example.\n", " | \n", " | If None, the estimator's score method is used.\n", " | \n", " | n_jobs : int, default=None\n", " | Number of jobs to run in parallel.\n", " | ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n", " | ``-1`` means using all processors. See :term:`Glossary `\n", " | for more details.\n", " | \n", " | .. versionchanged:: v0.20\n", " | `n_jobs` default changed from 1 to None\n", " | \n", " | pre_dispatch : int, or str, default=None\n", " | Controls the number of jobs that get dispatched during parallel\n", " | execution. Reducing this number can be useful to avoid an\n", " | explosion of memory consumption when more jobs get dispatched\n", " | than CPUs can process. This parameter can be:\n", " | \n", " | - None, in which case all the jobs are immediately\n", " | created and spawned. Use this for lightweight and\n", " | fast-running jobs, to avoid delays due to on-demand\n", " | spawning of the jobs\n", " | \n", " | - An int, giving the exact number of total jobs that are\n", " | spawned\n", " | \n", " | - A str, giving an expression as a function of n_jobs,\n", " | as in '2*n_jobs'\n", " | \n", " | cv : int, cross-validation generator or an iterable, default=None\n", " | Determines the cross-validation splitting strategy.\n", " | Possible inputs for cv are:\n", " | \n", " | - None, to use the default 5-fold cross validation,\n", " | - integer, to specify the number of folds in a `(Stratified)KFold`,\n", " | - :term:`CV splitter`,\n", " | - An iterable yielding (train, test) splits as arrays of indices.\n", " | \n", " | For integer/None inputs, if the estimator is a classifier and ``y`` is\n", " | either binary or multiclass, :class:`StratifiedKFold` is used. 
In all\n", " | other cases, :class:`KFold` is used.\n", " | \n", " | Refer :ref:`User Guide ` for the various\n", " | cross-validation strategies that can be used here.\n", " | \n", " | .. versionchanged:: 0.22\n", " | ``cv`` default value if None changed from 3-fold to 5-fold.\n", " | \n", " | refit : bool, str, or callable, default=True\n", " | Refit an estimator using the best found parameters on the whole\n", " | dataset.\n", " | \n", " | For multiple metric evaluation, this needs to be a `str` denoting the\n", " | scorer that would be used to find the best parameters for refitting\n", " | the estimator at the end.\n", " | \n", " | Where there are considerations other than maximum score in\n", " | choosing a best estimator, ``refit`` can be set to a function which\n", " | returns the selected ``best_index_`` given the ``cv_results``. In that\n", " | case, the ``best_estimator_`` and ``best_params_`` will be set\n", " | according to the returned ``best_index_`` while the ``best_score_``\n", " | attribute will not be available.\n", " | \n", " | The refitted estimator is made available at the ``best_estimator_``\n", " | attribute and permits using ``predict`` directly on this\n", " | ``RandomizedSearchCV`` instance.\n", " | \n", " | Also for multiple metric evaluation, the attributes ``best_index_``,\n", " | ``best_score_`` and ``best_params_`` will only be available if\n", " | ``refit`` is set and all of them will be determined w.r.t this specific\n", " | scorer.\n", " | \n", " | See ``scoring`` parameter to know more about multiple metric\n", " | evaluation.\n", " | \n", " | .. versionchanged:: 0.20\n", " | Support for callable added.\n", " | \n", " | verbose : int\n", " | Controls the verbosity: the higher, the more messages.\n", " | \n", " | random_state : int, RandomState instance or None, default=None\n", " | Pseudo random number generator state used for random uniform sampling\n", " | from lists of possible values instead of scipy.stats distributions.\n", " | Pass an int for reproducible output across multiple\n", " | function calls.\n", " | See :term:`Glossary `.\n", " | \n", " | error_score : 'raise' or numeric, default=np.nan\n", " | Value to assign to the score if an error occurs in estimator fitting.\n", " | If set to 'raise', the error is raised. If a numeric value is given,\n", " | FitFailedWarning is raised. This parameter does not affect the refit\n", " | step, which will always raise the error.\n", " | \n", " | return_train_score : bool, default=False\n", " | If ``False``, the ``cv_results_`` attribute will not include training\n", " | scores.\n", " | Computing training scores is used to get insights on how different\n", " | parameter settings impact the overfitting/underfitting trade-off.\n", " | However computing the scores on the training set can be computationally\n", " | expensive and is not strictly required to select the parameters that\n", " | yield the best generalization performance.\n", " | \n", " | .. versionadded:: 0.19\n", " | \n", " | .. 
versionchanged:: 0.21\n", " | Default value was changed from ``True`` to ``False``\n", " | \n", " | Attributes\n", " | ----------\n", " | cv_results_ : dict of numpy (masked) ndarrays\n", " | A dict with keys as column headers and values as columns, that can be\n", " | imported into a pandas ``DataFrame``.\n", " | \n", " | For instance the below given table\n", " | \n", " | +--------------+-------------+-------------------+---+---------------+\n", " | | param_kernel | param_gamma | split0_test_score |...|rank_test_score|\n", " | +==============+=============+===================+===+===============+\n", " | | 'rbf' | 0.1 | 0.80 |...| 1 |\n", " | +--------------+-------------+-------------------+---+---------------+\n", " | | 'rbf' | 0.2 | 0.84 |...| 3 |\n", " | +--------------+-------------+-------------------+---+---------------+\n", " | | 'rbf' | 0.3 | 0.70 |...| 2 |\n", " | +--------------+-------------+-------------------+---+---------------+\n", " | \n", " | will be represented by a ``cv_results_`` dict of::\n", " | \n", " | {\n", " | 'param_kernel' : masked_array(data = ['rbf', 'rbf', 'rbf'],\n", " | mask = False),\n", " | 'param_gamma' : masked_array(data = [0.1 0.2 0.3], mask = False),\n", " | 'split0_test_score' : [0.80, 0.84, 0.70],\n", " | 'split1_test_score' : [0.82, 0.50, 0.70],\n", " | 'mean_test_score' : [0.81, 0.67, 0.70],\n", " | 'std_test_score' : [0.01, 0.24, 0.00],\n", " | 'rank_test_score' : [1, 3, 2],\n", " | 'split0_train_score' : [0.80, 0.92, 0.70],\n", " | 'split1_train_score' : [0.82, 0.55, 0.70],\n", " | 'mean_train_score' : [0.81, 0.74, 0.70],\n", " | 'std_train_score' : [0.01, 0.19, 0.00],\n", " | 'mean_fit_time' : [0.73, 0.63, 0.43],\n", " | 'std_fit_time' : [0.01, 0.02, 0.01],\n", " | 'mean_score_time' : [0.01, 0.06, 0.04],\n", " | 'std_score_time' : [0.00, 0.00, 0.00],\n", " | 'params' : [{'kernel' : 'rbf', 'gamma' : 0.1}, ...],\n", " | }\n", " | \n", " | NOTE\n", " | \n", " | The key ``'params'`` is used to store a list of parameter\n", " | settings dicts for all the parameter candidates.\n", " | \n", " | The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and\n", " | ``std_score_time`` are all in seconds.\n", " | \n", " | For multi-metric evaluation, the scores for all the scorers are\n", " | available in the ``cv_results_`` dict at the keys ending with that\n", " | scorer's name (``'_'``) instead of ``'_score'`` shown\n", " | above. ('split0_test_precision', 'mean_train_precision' etc.)\n", " | \n", " | best_estimator_ : estimator\n", " | Estimator that was chosen by the search, i.e. estimator\n", " | which gave highest score (or smallest loss if specified)\n", " | on the left out data. Not available if ``refit=False``.\n", " | \n", " | For multi-metric evaluation, this attribute is present only if\n", " | ``refit`` is specified.\n", " | \n", " | See ``refit`` parameter for more information on allowed values.\n", " | \n", " | best_score_ : float\n", " | Mean cross-validated score of the best_estimator.\n", " | \n", " | For multi-metric evaluation, this is not available if ``refit`` is\n", " | ``False``. See ``refit`` parameter for more information.\n", " | \n", " | This attribute is not available if ``refit`` is a function.\n", " | \n", " | best_params_ : dict\n", " | Parameter setting that gave the best results on the hold out data.\n", " | \n", " | For multi-metric evaluation, this is not available if ``refit`` is\n", " | ``False``. 
See ``refit`` parameter for more information.\n", " | \n", " | best_index_ : int\n", " | The index (of the ``cv_results_`` arrays) which corresponds to the best\n", " | candidate parameter setting.\n", " | \n", " | The dict at ``search.cv_results_['params'][search.best_index_]`` gives\n", " | the parameter setting for the best model, that gives the highest\n", " | mean score (``search.best_score_``).\n", " | \n", " | For multi-metric evaluation, this is not available if ``refit`` is\n", " | ``False``. See ``refit`` parameter for more information.\n", " | \n", " | scorer_ : function or a dict\n", " | Scorer function used on the held out data to choose the best\n", " | parameters for the model.\n", " | \n", " | For multi-metric evaluation, this attribute holds the validated\n", " | ``scoring`` dict which maps the scorer key to the scorer callable.\n", " | \n", " | n_splits_ : int\n", " | The number of cross-validation splits (folds/iterations).\n", " | \n", " | refit_time_ : float\n", " | Seconds used for refitting the best model on the whole dataset.\n", " | \n", " | This is present only if ``refit`` is not False.\n", " | \n", " | .. versionadded:: 0.20\n", " | \n", " | multimetric_ : bool\n", " | Whether or not the scorers compute several metrics.\n", " | \n", " | Notes\n", " | -----\n", " | The parameters selected are those that maximize the score of the held-out\n", " | data, according to the scoring parameter.\n", " | \n", " | If `n_jobs` was set to a value higher than one, the data is copied for each\n", " | parameter setting(and not `n_jobs` times). This is done for efficiency\n", " | reasons if individual jobs take very little time, but may raise errors if\n", " | the dataset is large and not enough memory is available. A workaround in\n", " | this case is to set `pre_dispatch`. Then, the memory is copied only\n", " | `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 *\n", " | n_jobs`.\n", " | \n", " | See Also\n", " | --------\n", " | GridSearchCV : Does exhaustive search over a grid of parameters.\n", " | ParameterSampler : A generator over parameter settings, constructed from\n", " | param_distributions.\n", " | \n", " | Examples\n", " | --------\n", " | >>> from sklearn.datasets import load_iris\n", " | >>> from sklearn.linear_model import LogisticRegression\n", " | >>> from sklearn.model_selection import RandomizedSearchCV\n", " | >>> from scipy.stats import uniform\n", " | >>> iris = load_iris()\n", " | >>> logistic = LogisticRegression(solver='saga', tol=1e-2, max_iter=200,\n", " | ... random_state=0)\n", " | >>> distributions = dict(C=uniform(loc=0, scale=4),\n", " | ... penalty=['l2', 'l1'])\n", " | >>> clf = RandomizedSearchCV(logistic, distributions, random_state=0)\n", " | >>> search = clf.fit(iris.data, iris.target)\n", " | >>> search.best_params_\n", " | {'C': 2..., 'penalty': 'l1'}\n", " | \n", " | Method resolution order:\n", " | RandomizedSearchCV\n", " | BaseSearchCV\n", " | sklearn.base.MetaEstimatorMixin\n", " | sklearn.base.BaseEstimator\n", " | builtins.object\n", " | \n", " | Methods defined here:\n", " | \n", " | __init__(self, estimator, param_distributions, *, n_iter=10, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score=nan, return_train_score=False)\n", " | Initialize self. 
See help(type(self)) for accurate signature.\n", " | \n", " | ----------------------------------------------------------------------\n", " | Data and other attributes defined here:\n", " | \n", " | __abstractmethods__ = frozenset()\n", " | \n", " | ----------------------------------------------------------------------\n", " | Methods inherited from BaseSearchCV:\n", " | \n", " | decision_function(self, X)\n", " | Call decision_function on the estimator with the best found parameters.\n", " | \n", " | Only available if ``refit=True`` and the underlying estimator supports\n", " | ``decision_function``.\n", " | \n", " | Parameters\n", " | ----------\n", " | X : indexable, length n_samples\n", " | Must fulfill the input assumptions of the\n", " | underlying estimator.\n", " | \n", " | fit(self, X, y=None, *, groups=None, **fit_params)\n", " | Run fit with all sets of parameters.\n", " | \n", " | Parameters\n", " | ----------\n", " | \n", " | X : array-like of shape (n_samples, n_features)\n", " | Training vector, where n_samples is the number of samples and\n", " | n_features is the number of features.\n", " | \n", " | y : array-like of shape (n_samples, n_output) or (n_samples,), default=None\n", " | Target relative to X for classification or regression;\n", " | None for unsupervised learning.\n", " | \n", " | groups : array-like of shape (n_samples,), default=None\n", " | Group labels for the samples used while splitting the dataset into\n", " | train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n", " | instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).\n", " | \n", " | **fit_params : dict of str -> object\n", " | Parameters passed to the ``fit`` method of the estimator\n", " | \n", " | inverse_transform(self, Xt)\n", " | Call inverse_transform on the estimator with the best found params.\n", " | \n", " | Only available if the underlying estimator implements\n", " | ``inverse_transform`` and ``refit=True``.\n", " | \n", " | Parameters\n", " | ----------\n", " | Xt : indexable, length n_samples\n", " | Must fulfill the input assumptions of the\n", " | underlying estimator.\n", " | \n", " | predict(self, X)\n", " | Call predict on the estimator with the best found parameters.\n", " | \n", " | Only available if ``refit=True`` and the underlying estimator supports\n", " | ``predict``.\n", " | \n", " | Parameters\n", " | ----------\n", " | X : indexable, length n_samples\n", " | Must fulfill the input assumptions of the\n", " | underlying estimator.\n", " | \n", " | predict_log_proba(self, X)\n", " | Call predict_log_proba on the estimator with the best found parameters.\n", " | \n", " | Only available if ``refit=True`` and the underlying estimator supports\n", " | ``predict_log_proba``.\n", " | \n", " | Parameters\n", " | ----------\n", " | X : indexable, length n_samples\n", " | Must fulfill the input assumptions of the\n", " | underlying estimator.\n", " | \n", " | predict_proba(self, X)\n", " | Call predict_proba on the estimator with the best found parameters.\n", " | \n", " | Only available if ``refit=True`` and the underlying estimator supports\n", " | ``predict_proba``.\n", " | \n", " | Parameters\n", " | ----------\n", " | X : indexable, length n_samples\n", " | Must fulfill the input assumptions of the\n", " | underlying estimator.\n", " | \n", " | score(self, X, y=None)\n", " | Returns the score on the given data, if the estimator has been refit.\n", " | \n", " | This uses the score defined by ``scoring`` where provided, and the\n", " | 
``best_estimator_.score`` method otherwise.\n", " | \n", " | Parameters\n", " | ----------\n", " | X : array-like of shape (n_samples, n_features)\n", " | Input data, where n_samples is the number of samples and\n", " | n_features is the number of features.\n", " | \n", " | y : array-like of shape (n_samples, n_output) or (n_samples,), default=None\n", " | Target relative to X for classification or regression;\n", " | None for unsupervised learning.\n", " | \n", " | Returns\n", " | -------\n", " | score : float\n", " | \n", " | score_samples(self, X)\n", " | Call score_samples on the estimator with the best found parameters.\n", " | \n", " | Only available if ``refit=True`` and the underlying estimator supports\n", " | ``score_samples``.\n", " | \n", " | .. versionadded:: 0.24\n", " | \n", " | Parameters\n", " | ----------\n", " | X : iterable\n", " | Data to predict on. Must fulfill input requirements\n", " | of the underlying estimator.\n", " | \n", " | Returns\n", " | -------\n", " | y_score : ndarray of shape (n_samples,)\n", " | \n", " | transform(self, X)\n", " | Call transform on the estimator with the best found parameters.\n", " | \n", " | Only available if the underlying estimator supports ``transform`` and\n", " | ``refit=True``.\n", " | \n", " | Parameters\n", " | ----------\n", " | X : indexable, length n_samples\n", " | Must fulfill the input assumptions of the\n", " | underlying estimator.\n", " | \n", " | ----------------------------------------------------------------------\n", " | Readonly properties inherited from BaseSearchCV:\n", " | \n", " | classes_\n", " | \n", " | n_features_in_\n", " | \n", " | ----------------------------------------------------------------------\n", " | Data descriptors inherited from sklearn.base.MetaEstimatorMixin:\n", " | \n", " | __dict__\n", " | dictionary for instance variables (if defined)\n", " | \n", " | __weakref__\n", " | list of weak references to the object (if defined)\n", " | \n", " | ----------------------------------------------------------------------\n", " | Methods inherited from sklearn.base.BaseEstimator:\n", " | \n", " | __getstate__(self)\n", " | \n", " | __repr__(self, N_CHAR_MAX=700)\n", " | Return repr(self).\n", " | \n", " | __setstate__(self, state)\n", " | \n", " | get_params(self, deep=True)\n", " | Get parameters for this estimator.\n", " | \n", " | Parameters\n", " | ----------\n", " | deep : bool, default=True\n", " | If True, will return the parameters for this estimator and\n", " | contained subobjects that are estimators.\n", " | \n", " | Returns\n", " | -------\n", " | params : dict\n", " | Parameter names mapped to their values.\n", " | \n", " | set_params(self, **params)\n", " | Set the parameters of this estimator.\n", " | \n", " | The method works on simple estimators as well as on nested objects\n", " | (such as :class:`~sklearn.pipeline.Pipeline`). 
The latter have\n", " |         parameters of the form ``__`` so that it's\n", " |         possible to update each component of a nested object.\n", " |         \n", " |         Parameters\n", " |         ----------\n", " |         **params : dict\n", " |             Estimator parameters.\n", " |         \n", " |         Returns\n", " |         -------\n", " |         self : estimator instance\n", " |             Estimator instance.\n", "\n" ] } ], "source": [ "help(RandomizedSearchCV)" ] }, { "cell_type": "code", "execution_count": 7, "id": "53b81163", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "RandomForestClassifier(max_depth=460, max_features='sqrt', n_estimators=1600)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model=RandomForestClassifier(n_estimators=1600,max_features='sqrt',max_depth=460)\n", "model.fit(x_train,y_train)" ] }, { "cell_type": "code", "execution_count": 8, "id": "dbaa5137", "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import cross_val_score\n", "from sklearn.metrics import accuracy_score,confusion_matrix\n", "y_pred=model.predict(x_test)\n", "cv_score=cross_val_score(model,x,y,cv=10)\n" ] }, { "cell_type": "code", "execution_count": 10, "id": "c04a97fe", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "========Accuracy=========\n", "0.8131868131868132\n", "========confusion_matrix=========\n", "[[32  9]\n", " [ 8 42]]\n", "========All accuracy scores=========\n", "[0.90322581 0.80645161 0.83870968 0.9        0.9        0.83333333\n", " 0.73333333 0.83333333 0.73333333 0.8       ]\n", "========mean accuracy score=========\n", "0.8281720430107526\n" ] } ], "source": [ "print(\"========Accuracy=========\")\n", "print(accuracy_score(y_test,y_pred))\n", "print(\"========confusion_matrix=========\")\n", "print(confusion_matrix(y_test,y_pred))\n", "print(\"========All accuracy scores=========\")\n", "print(cv_score)\n", "print(\"========mean accuracy score=========\")\n", "print(cv_score.mean())" ] }, { "cell_type": "markdown", "id": "a449ca04", "metadata": {}, "source": [ "***Random Forest Regressor***\n", "- it predicts values by averaging the outputs of all the trees (see the sketch below)" ] },
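{ "cell_type": "markdown", "id": "avg-demo-md", "metadata": {}, "source": [ "A minimal sketch of that averaging on synthetic data (illustrative only, not the salary dataset): the forest's prediction is exactly the mean of its individual trees' predictions." ] }, { "cell_type": "code", "execution_count": null, "id": "avg-demo-code", "metadata": {}, "outputs": [], "source": [ "# Illustrative sketch: average the per-tree predictions by hand and compare\n", "# with the regressor's own output.\n", "import numpy as np\n", "from sklearn.datasets import make_regression\n", "from sklearn.ensemble import RandomForestRegressor\n", "\n", "X_demo,y_demo=make_regression(n_samples=200,n_features=3,noise=5.0,random_state=0)\n", "reg=RandomForestRegressor(n_estimators=50,random_state=0).fit(X_demo,y_demo)\n", "\n", "per_tree=np.array([tree.predict(X_demo[:1])[0] for tree in reg.estimators_])\n", "print(per_tree.mean())             # mean of the 50 per-tree predictions\n", "print(reg.predict(X_demo[:1])[0])  # the forest returns the same average" ] }, { "cell_type": "code", "execution_count": 14, "id": "0640458e", "metadata": { "collapsed": true }, "outputs": [ { "data": { "text/html": [ "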
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AgeAnnual SalaryWeekly hoursEducation
072160000.040.0Bachelor's degree or higher
172100000.050.0Bachelor's degree or higher
231120000.040.0Bachelor's degree or higher
32845000.040.0Bachelor's degree or higher
45485000.040.0Bachelor's degree or higher
...............
4952747000.040.0Bachelor's degree or higher
49653132000.070.0Bachelor's degree or higher
4975110100.020.0Bachelor's degree or higher
4983257000.035.0Bachelor's degree or higher
4991818700.020.0Attended college, no degree
\n", "

500 rows × 4 columns

\n", "
" ], "text/plain": [ " Age Annual Salary Weekly hours Education\n", "0 72 160000.0 40.0 Bachelor's degree or higher\n", "1 72 100000.0 50.0 Bachelor's degree or higher\n", "2 31 120000.0 40.0 Bachelor's degree or higher\n", "3 28 45000.0 40.0 Bachelor's degree or higher\n", "4 54 85000.0 40.0 Bachelor's degree or higher\n", ".. ... ... ... ...\n", "495 27 47000.0 40.0 Bachelor's degree or higher\n", "496 53 132000.0 70.0 Bachelor's degree or higher\n", "497 51 10100.0 20.0 Bachelor's degree or higher\n", "498 32 57000.0 35.0 Bachelor's degree or higher\n", "499 18 18700.0 20.0 Attended college, no degree\n", "\n", "[500 rows x 4 columns]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df=pd.read_csv(\"https://raw.githubusercontent.com/AP-State-Skill-Development-Corporation/Datasets/master/Regression/age_salary_hours.csv\")\n", "df" ] }, { "cell_type": "code", "execution_count": 18, "id": "e9821140", "metadata": { "collapsed": true }, "outputs": [ { "data": { "text/plain": [ "array([[40.],\n", " [50.],\n", " [40.],\n", " [40.],\n", " [40.],\n", " [40.],\n", " [ 0.],\n", " [40.],\n", " [ 0.],\n", " [50.],\n", " [50.],\n", " [ 0.],\n", " [ 0.],\n", " [40.],\n", " [45.],\n", " [27.],\n", " [40.],\n", " [ 0.],\n", " [55.],\n", " [50.],\n", " [40.],\n", " [40.],\n", " [40.],\n", " [45.],\n", " [40.],\n", " [40.],\n", " [ 0.],\n", " [36.],\n", " [40.],\n", " [40.],\n", " [29.],\n", " [40.],\n", " [35.],\n", " [ 0.],\n", " [ 0.],\n", " [40.],\n", " [40.],\n", " [30.],\n", " [35.],\n", " [ 0.],\n", " [50.],\n", " [13.],\n", " [40.],\n", " [40.],\n", " [ 0.],\n", " [ 0.],\n", " [40.],\n", " [43.],\n", " [40.],\n", " [60.],\n", " [ 0.],\n", " [ 0.],\n", " [20.],\n", " [35.],\n", " [30.],\n", " [45.],\n", " [10.],\n", " [ 0.],\n", " [40.],\n", " [50.],\n", " [50.],\n", " [ 0.],\n", " [ 0.],\n", " [ 0.],\n", " [39.],\n", " [50.],\n", " [40.],\n", " [ 0.],\n", " [45.],\n", " [ 0.],\n", " [ 0.],\n", " [50.],\n", " [25.],\n", " [ 0.],\n", " [ 0.],\n", " [40.],\n", " [ 0.],\n", " [20.],\n", " [ 0.],\n", " [40.],\n", " [10.],\n", " [50.],\n", " [18.],\n", " [40.],\n", " [50.],\n", " [ 0.],\n", " [ 0.],\n", " [20.],\n", " [17.],\n", " [ 0.],\n", " [ 0.],\n", " [10.],\n", " [70.],\n", " [ 0.],\n", " [ 0.],\n", " [50.],\n", " [ 0.],\n", " [60.],\n", " [ 0.],\n", " [20.],\n", " [50.],\n", " [15.],\n", " [ 0.],\n", " [40.],\n", " [45.],\n", " [38.],\n", " [50.],\n", " [40.],\n", " [17.],\n", " [15.],\n", " [45.],\n", " [ 0.],\n", " [ 0.],\n", " [20.],\n", " [40.],\n", " [ 0.],\n", " [35.],\n", " [60.],\n", " [ 0.],\n", " [18.],\n", " [35.],\n", " [ 0.],\n", " [40.],\n", " [ 0.],\n", " [ 0.],\n", " [50.],\n", " [ 0.],\n", " [32.],\n", " [15.],\n", " [20.],\n", " [ 8.],\n", " [50.],\n", " [50.],\n", " [25.],\n", " [40.],\n", " [35.],\n", " [12.],\n", " [40.],\n", " [75.],\n", " [45.],\n", " [ 0.],\n", " [ 0.],\n", " [80.],\n", " [ 0.],\n", " [40.],\n", " [ 0.],\n", " [ 0.],\n", " [50.],\n", " [ 0.],\n", " [55.],\n", " [ 0.],\n", " [40.],\n", " [ 0.],\n", " [60.],\n", " [ 0.],\n", " [40.],\n", " [40.],\n", " [ 0.],\n", " [44.],\n", " [ 8.],\n", " [40.],\n", " [20.],\n", " [40.],\n", " [45.],\n", " [40.],\n", " [42.],\n", " [40.],\n", " [40.],\n", " [38.],\n", " [40.],\n", " [ 0.],\n", " [ 0.],\n", " [ 0.],\n", " [ 0.],\n", " [40.],\n", " [30.],\n", " [40.],\n", " [35.],\n", " [ 0.],\n", " [40.],\n", " [44.],\n", " [60.],\n", " [60.],\n", " [50.],\n", " [40.],\n", " [40.],\n", " [40.],\n", " [26.],\n", " [38.],\n", " [ 0.],\n", " [16.],\n", " [40.],\n", " [35.],\n", " 
[35.],\n", " [50.],\n", " [20.],\n", " [60.],\n", " [40.],\n", " [25.],\n", " [ 0.],\n", " [50.],\n", " [20.],\n", " [30.],\n", " [40.],\n", " [40.],\n", " [ 0.],\n", " [50.],\n", " [ 0.],\n", " [25.],\n", " [40.],\n", " [36.],\n", " [50.],\n", " [40.],\n", " [30.],\n", " [ 0.],\n", " [ 0.],\n", " [ 3.],\n", " [ 0.],\n", " [45.],\n", " [40.],\n", " [ 0.],\n", " [ 3.],\n", " [ 0.],\n", " [50.],\n", " [40.],\n", " [37.],\n", " [36.],\n", " [ 0.],\n", " [ 0.],\n", " [ 0.],\n", " [ 2.],\n", " [40.],\n", " [ 0.],\n", " [40.],\n", " [ 0.],\n", " [50.],\n", " [50.],\n", " [50.],\n", " [40.],\n", " [40.],\n", " [40.],\n", " [40.],\n", " [40.],\n", " [ 0.],\n", " [18.],\n", " [ 0.],\n", " [10.],\n", " [ 0.],\n", " [ 0.],\n", " [40.],\n", " [50.],\n", " [40.],\n", " [ 0.],\n", " [ 0.],\n", " [38.],\n", " [40.],\n", " [ 0.],\n", " [35.],\n", " [ 0.],\n", " [40.],\n", " [30.],\n", " [40.],\n", " [ 0.],\n", " [40.],\n", " [ 0.],\n", " [20.],\n", " [50.],\n", " [30.],\n", " [40.],\n", " [40.],\n", " [ 0.],\n", " [ 0.],\n", " [ 0.],\n", " [ 0.],\n", " [40.],\n", " [24.],\n", " [ 0.],\n", " [40.],\n", " [60.],\n", " [50.],\n", " [ 0.],\n", " [36.],\n", " [40.],\n", " [40.],\n", " [45.],\n", " [40.],\n", " [40.],\n", " [ 0.],\n", " [36.],\n", " [50.],\n", " [38.],\n", " [35.],\n", " [40.],\n", " [36.],\n", " [90.],\n", " [ 0.],\n", " [40.],\n", " [40.],\n", " [45.],\n", " [ 0.],\n", " [40.],\n", " [40.],\n", " [40.],\n", " [ 0.],\n", " [35.],\n", " [ 0.],\n", " [50.],\n", " [ 0.],\n", " [37.],\n", " [40.],\n", " [70.],\n", " [45.],\n", " [ 0.],\n", " [40.],\n", " [ 0.],\n", " [ 0.],\n", " [20.],\n", " [24.],\n", " [40.],\n", " [35.],\n", " [30.],\n", " [40.],\n", " [50.],\n", " [ 4.],\n", " [40.],\n", " [40.],\n", " [ 0.],\n", " [45.],\n", " [40.],\n", " [ 0.],\n", " [ 0.],\n", " [50.],\n", " [40.],\n", " [50.],\n", " [40.],\n", " [ 0.],\n", " [50.],\n", " [35.],\n", " [40.],\n", " [50.],\n", " [45.],\n", " [ 0.],\n", " [ 0.],\n", " [40.],\n", " [ 0.],\n", " [40.],\n", " [40.],\n", " [40.],\n", " [ 0.],\n", " [43.],\n", " [20.],\n", " [40.],\n", " [40.],\n", " [30.],\n", " [25.],\n", " [40.],\n", " [35.],\n", " [35.],\n", " [ 0.],\n", " [35.],\n", " [30.],\n", " [ 0.],\n", " [40.],\n", " [35.],\n", " [40.],\n", " [35.],\n", " [36.],\n", " [20.],\n", " [55.],\n", " [40.],\n", " [70.],\n", " [40.],\n", " [37.],\n", " [40.],\n", " [35.],\n", " [40.],\n", " [50.],\n", " [52.],\n", " [40.],\n", " [ 0.],\n", " [ 0.],\n", " [42.],\n", " [30.],\n", " [45.],\n", " [ 2.],\n", " [20.],\n", " [50.],\n", " [48.],\n", " [40.],\n", " [20.],\n", " [ 0.],\n", " [40.],\n", " [40.],\n", " [41.],\n", " [50.],\n", " [40.],\n", " [50.],\n", " [50.],\n", " [24.],\n", " [50.],\n", " [ 0.],\n", " [13.],\n", " [40.],\n", " [ 0.],\n", " [ 0.],\n", " [30.],\n", " [35.],\n", " [40.],\n", " [ 0.],\n", " [40.],\n", " [70.],\n", " [30.],\n", " [ 5.],\n", " [ 0.],\n", " [ 0.],\n", " [ 0.],\n", " [30.],\n", " [25.],\n", " [ 0.],\n", " [50.],\n", " [ 0.],\n", " [ 0.],\n", " [40.],\n", " [ 0.],\n", " [ 0.],\n", " [40.],\n", " [40.],\n", " [ 0.],\n", " [ 0.],\n", " [40.],\n", " [40.],\n", " [50.],\n", " [ 0.],\n", " [60.],\n", " [45.],\n", " [42.],\n", " [35.],\n", " [15.],\n", " [ 0.],\n", " [28.],\n", " [ 0.],\n", " [16.],\n", " [40.],\n", " [25.],\n", " [ 5.],\n", " [40.],\n", " [38.],\n", " [99.],\n", " [40.],\n", " [ 0.],\n", " [ 0.],\n", " [40.],\n", " [40.],\n", " [ 0.],\n", " [38.],\n", " [40.],\n", " [45.],\n", " [55.],\n", " [40.],\n", " [40.],\n", " [50.],\n", " [55.],\n", " [50.],\n", " [40.],\n", " [ 0.],\n", " [35.],\n", " 
[40.],\n", "       [ 0.],\n", "       [40.],\n", "       [40.],\n", "       [50.],\n", "       [40.],\n", "       [40.],\n", "       [35.],\n", "       [50.],\n", "       [60.],\n", "       [40.],\n", "       [40.],\n", "       [40.],\n", "       [45.],\n", "       [40.],\n", "       [ 0.],\n", "       [40.],\n", "       [10.],\n", "       [60.],\n", "       [ 0.],\n", "       [35.],\n", "       [45.],\n", "       [30.],\n", "       [38.],\n", "       [38.],\n", "       [40.],\n", "       [50.],\n", "       [ 0.],\n", "       [32.],\n", "       [40.],\n", "       [70.],\n", "       [20.],\n", "       [35.],\n", "       [20.]])" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x=df[\"Weekly hours\"].values.reshape(-1,1)\n", "x" ] }, { "cell_type": "code", "execution_count": 22, "id": "1fef12be", "metadata": {}, "outputs": [], "source": [ "y=df[\"Annual Salary\"].values  # keep y 1-D so fit() does not warn about a column vector" ] }, { "cell_type": "code", "execution_count": 23, "id": "efb05625", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "RandomForestRegressor()" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.ensemble import RandomForestRegressor\n", "model=RandomForestRegressor()\n", "model.fit(x,y)" ] }, { "cell_type": "code", "execution_count": 24, "id": "7fbd70f5", "metadata": {}, "outputs": [], "source": [ "y_pred=model.predict(x)" ] }, { "cell_type": "code", "execution_count": null, "id": "c41886b0", "metadata": {}, "outputs": [], "source": [ "# model.score takes (X, y), not (y, y_pred); it returns R^2 of predict(X) against y\n", "model.score(x,y)*100" ] },
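{ "cell_type": "markdown", "id": "eval-sketch-md", "metadata": {}, "source": [ "The score above is computed on the same rows the model was fit on, so it is optimistic. A minimal sketch of evaluating on a held-out split instead (the 70/30 split and `random_state=42` are illustrative choices, not from the original run):" ] }, { "cell_type": "code", "execution_count": null, "id": "eval-sketch-code", "metadata": {}, "outputs": [], "source": [ "# Sketch: hold out a test split and report R^2 there (assumes df from the salary CSV above)\n", "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.metrics import r2_score\n", "from sklearn.model_selection import train_test_split\n", "\n", "x=df[\"Weekly hours\"].values.reshape(-1,1)\n", "y=df[\"Annual Salary\"].values\n", "x_tr,x_te,y_tr,y_te=train_test_split(x,y,train_size=0.7,random_state=42)\n", "\n", "reg=RandomForestRegressor(random_state=42).fit(x_tr,y_tr)\n", "print(r2_score(y_te,reg.predict(x_te))*100)  # held-out R^2, as a percentage" ] }, { "cell_type": "code", "execution_count": null, "id": "9ae0e563", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 5 }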