import pandas as pd

df = pd.read_csv("phishing_url_dataset.csv")
print(df.shape)
df.head()

(2488, 14)

import os
print("Current folder:", os.getcwd())
print("Files here:", os.listdir("."))

Current folder: C:\Users\user\phishing_project
Files here: ['.ipynb_checkpoints', '.venv', 'phishing_detection.ipynb', 'phishing_url_dataset.csv']

import pandas as pd
df = pd.read_csv("phishing_url_dataset.csv")
df.head()

import pandas as pd

df = pd.read_csv("phishing_url_dataset.csv")
print(df.shape)
print(df.columns.tolist())
df.head()

(2488, 14)
['url_length', 'valid_url', 'at_symbol', 'sensitive_words_count', 'path_length', 'isHttps', 'nb_dots', 'nb_hyphens', 'nb_and', 'nb_or', 'nb_www', 'nb_com', 'nb_underscore', 'target']

for c in df.columns:
    if df[c].nunique() <= 10:
        print("\n", c)
        print(df[c].value_counts())

 valid_url
valid_url
0    1791
1     697
Name: count, dtype: int64

 at_symbol
at_symbol
0    2482
1       4
8       1
9       1
Name: count, dtype: int64

 sensitive_words_count
sensitive_words_count
0    1934
1     532
2      20
3       2
Name: count, dtype: int64

 isHttps
isHttps
0    1424
1    1064
Name: count, dtype: int64

 nb_hyphens
nb_hyphens
0     1810
1      318
2      154
4       74
3       73
5       36
6       14
7        6
10       3
Name: count, dtype: int64

 nb_and
nb_and
0    2451
1      36
2       1
Name: count, dtype: int64

 nb_or
nb_or
0    2083
1     369
2      31
3       5
Name: count, dtype: int64

 nb_www
nb_www
0    1932
1     541
2      15
Name: count, dtype: int64

 nb_com
nb_com
1    1472
0     949
2      63
3       4
Name: count, dtype: int64

 nb_underscore
nb_underscore
0    2241
1     185
2      34
3      22
4       3
6       1
5       1
7       1
Name: count, dtype: int64

 target
target
0    1313
1    1175
Name: count, dtype: int64

target_col = "valid_url"

# 1 = phishing (positive class), 0 = legitimate
df["is_phishing"] = (df[target_col] == 0).astype(int)

df["is_phishing"].value_counts()

is_phishing
1    1791
0     697
Name: count, dtype: int64

import numpy as np

# clean any inf values if present
df = df.replace([np.inf, -np.inf], np.nan)

X = df.drop(columns=[target_col, "is_phishing"], errors="ignore")
y = df["is_phishing"]

# keep only numeric columns (safe if there is a url text column)
X = X.select_dtypes(include=["number"])

print(X.shape, y.shape)
X.head()

(2488, 13) (2488,)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

y_train.value_counts(normalize=True), y_test.value_counts(normalize=True)

(is_phishing
 1    0.720101
 0    0.279899
 Name: proportion, dtype: float64,
 is_phishing
 1    0.718876
 0    0.281124
 Name: proportion, dtype: float64)

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

baseline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=500, class_weight="balanced"))
])

baseline.fit(X_train, y_train)

Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler()),
                ('model',
                 LogisticRegression(class_weight='balanced', max_iter=500))])

from sklearn.metrics import (confusion_matrix, classification_report, 
                             roc_auc_score, average_precision_score)

pred = baseline.predict(X_test)
proba = baseline.predict_proba(X_test)[:, 1]

print("Confusion matrix:\n", confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
print("ROC-AUC:", roc_auc_score(y_test, proba))
print("PR-AUC:", average_precision_score(y_test, proba))

Confusion matrix:
 [[123  17]
 [ 60 298]]
              precision    recall  f1-score   support

           0       0.67      0.88      0.76       140
           1       0.95      0.83      0.89       358

    accuracy                           0.85       498
   macro avg       0.81      0.86      0.82       498
weighted avg       0.87      0.85      0.85       498

ROC-AUC: 0.9333998403830807
PR-AUC: 0.9710189862716602

import lightgbm as lgb

imp = SimpleImputer(strategy="median")
X_train_i = imp.fit_transform(X_train)
X_test_i  = imp.transform(X_test)

lgbm = lgb.LGBMClassifier(
    n_estimators=800,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.9,
    colsample_bytree=0.9,
    class_weight="balanced",
    random_state=42
)

lgbm.fit(X_train_i, y_train)

proba_lgb = lgbm.predict_proba(X_test_i)[:, 1]
pred_lgb = (proba_lgb >= 0.5).astype(int)

print("Confusion matrix:\n", confusion_matrix(y_test, pred_lgb))
print(classification_report(y_test, pred_lgb))
print("ROC-AUC:", roc_auc_score(y_test, proba_lgb))
print("PR-AUC:", average_precision_score(y_test, proba_lgb))

[LightGBM] [Info] Number of positive: 1433, number of negative: 557
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005895 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 1990, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
Confusion matrix:
 [[128  12]
 [ 10 348]]
              precision    recall  f1-score   support

           0       0.93      0.91      0.92       140
           1       0.97      0.97      0.97       358

    accuracy                           0.96       498
   macro avg       0.95      0.94      0.95       498
weighted avg       0.96      0.96      0.96       498

ROC-AUC: 0.9795091779728651
PR-AUC: 0.9877211844875851

C:\Users\user\phishing_project\.venv\Lib\site-packages\sklearn\utils\validation.py:2691: UserWarning: X does not have valid feature names, but LGBMClassifier was fitted with feature names
  warnings.warn(

import pandas as pd
from sklearn.impute import SimpleImputer
import lightgbm as lgb

imp = SimpleImputer(strategy="median")

X_train_i = pd.DataFrame(
    imp.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index
)

X_test_i = pd.DataFrame(
    imp.transform(X_test),
    columns=X_test.columns,
    index=X_test.index
)

lgbm = lgb.LGBMClassifier(
    n_estimators=800,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.9,
    colsample_bytree=0.9,
    class_weight="balanced",
    random_state=42
)

lgbm.fit(X_train_i, y_train)

proba_lgb = lgbm.predict_proba(X_test_i)[:, 1]

[LightGBM] [Info] Number of positive: 1433, number of negative: 557
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001947 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 1990, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf

import numpy as np
from sklearn.metrics import precision_recall_curve, confusion_matrix, classification_report, average_precision_score

prec, rec, thr = precision_recall_curve(y_test, proba_lgb)
f1 = 2*(prec*rec)/(prec+rec+1e-9)

best_idx = np.argmax(f1)
best_thr = thr[best_idx]
best_thr

np.float64(0.33997688500658646)

pred_thr = (proba_lgb >= best_thr).astype(int)

print("PR-AUC:", average_precision_score(y_test, proba_lgb))
print("Threshold:", best_thr)
print(confusion_matrix(y_test, pred_thr))
print(classification_report(y_test, pred_thr))

PR-AUC: 0.9877211844875851
Threshold: 0.33997688500658646
[[128  12]
 [  9 349]]
              precision    recall  f1-score   support

           0       0.93      0.91      0.92       140
           1       0.97      0.97      0.97       358

    accuracy                           0.96       498
   macro avg       0.95      0.94      0.95       498
weighted avg       0.96      0.96      0.96       498

import shap
import matplotlib.pyplot as plt

# X_test_i should be a DataFrame with columns (recommended).
# If your X_test_i is numpy array, create DataFrame first:
# X_test_i = pd.DataFrame(X_test_i, columns=X_test.columns, index=X_test.index)

explainer = shap.TreeExplainer(lgbm)
shap_values = explainer.shap_values(X_test_i)

shap.summary_plot(shap_values, X_test_i, plot_type="bar", show=False)
plt.title("SHAP Feature Importance (Phishing Prediction)")
plt.show()

C:\Users\user\phishing_project\.venv\Lib\site-packages\shap\explainers\_tree.py:587: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
  warnings.warn(

!pip install -U ipywidgets jupyterlab_widgets

Requirement already satisfied: ipywidgets in .\.venv\Lib\site-packages (8.1.8)
Requirement already satisfied: jupyterlab_widgets in .\.venv\Lib\site-packages (3.0.16)
Requirement already satisfied: comm>=0.1.3 in .\.venv\Lib\site-packages (from ipywidgets) (0.2.3)
Requirement already satisfied: ipython>=6.1.0 in .\.venv\Lib\site-packages (from ipywidgets) (9.10.0)
Requirement already satisfied: traitlets>=4.3.1 in .\.venv\Lib\site-packages (from ipywidgets) (5.14.3)
Requirement already satisfied: widgetsnbextension~=4.0.14 in .\.venv\Lib\site-packages (from ipywidgets) (4.0.15)
Requirement already satisfied: colorama>=0.4.4 in .\.venv\Lib\site-packages (from ipython>=6.1.0->ipywidgets) (0.4.6)
Requirement already satisfied: decorator>=4.3.2 in .\.venv\Lib\site-packages (from ipython>=6.1.0->ipywidgets) (5.2.1)
Requirement already satisfied: ipython-pygments-lexers>=1.0.0 in .\.venv\Lib\site-packages (from ipython>=6.1.0->ipywidgets) (1.1.1)
Requirement already satisfied: jedi>=0.18.1 in .\.venv\Lib\site-packages (from ipython>=6.1.0->ipywidgets) (0.19.2)
Requirement already satisfied: matplotlib-inline>=0.1.5 in .\.venv\Lib\site-packages (from ipython>=6.1.0->ipywidgets) (0.2.1)
Requirement already satisfied: prompt_toolkit<3.1.0,>=3.0.41 in .\.venv\Lib\site-packages (from ipython>=6.1.0->ipywidgets) (3.0.52)
Requirement already satisfied: pygments>=2.11.0 in .\.venv\Lib\site-packages (from ipython>=6.1.0->ipywidgets) (2.19.2)
Requirement already satisfied: stack_data>=0.6.0 in .\.venv\Lib\site-packages (from ipython>=6.1.0->ipywidgets) (0.6.3)
Requirement already satisfied: typing_extensions>=4.6 in .\.venv\Lib\site-packages (from ipython>=6.1.0->ipywidgets) (4.15.0)
Requirement already satisfied: wcwidth in .\.venv\Lib\site-packages (from prompt_toolkit<3.1.0,>=3.0.41->ipython>=6.1.0->ipywidgets) (0.5.3)
Requirement already satisfied: parso<0.9.0,>=0.8.4 in .\.venv\Lib\site-packages (from jedi>=0.18.1->ipython>=6.1.0->ipywidgets) (0.8.5)
Requirement already satisfied: executing>=1.2.0 in .\.venv\Lib\site-packages (from stack_data>=0.6.0->ipython>=6.1.0->ipywidgets) (2.2.1)
Requirement already satisfied: asttokens>=2.1.0 in .\.venv\Lib\site-packages (from stack_data>=0.6.0->ipython>=6.1.0->ipywidgets) (3.0.1)
Requirement already satisfied: pure-eval in .\.venv\Lib\site-packages (from stack_data>=0.6.0->ipython>=6.1.0->ipywidgets) (0.2.3)

import pandas as pd

pred_thr = (proba_lgb >= best_thr).astype(int)

results = X_test.copy()
results["y_true"] = y_test.values
results["score"] = proba_lgb
results["y_pred"] = pred_thr

fp = results[(results.y_true==0) & (results.y_pred==1)]  # legit predicted phishing
fn = results[(results.y_true==1) & (results.y_pred==0)]  # phishing predicted legit

print("False Positives:", len(fp))
print("False Negatives:", len(fn))

fp.head(), fn.head()

False Positives: 12
False Negatives: 9

(      url_length  at_symbol  sensitive_words_count  path_length  isHttps  \
 1465          17          0                      0            0        1   
 2137          69          0                      1            0        0   
 1328          24          0                      0            1        0   
 1334          49          0                      0            3        0   
 1921          36          0                      0            4        0   
 
       nb_dots  nb_hyphens  nb_and  nb_or  nb_www  nb_com  nb_underscore  \
 1465        0           0       0      0       0       0              1   
 2137        8           0       0      0       0       0              0   
 1328        1           0       0      1       0       1              0   
 1334        7           1       0      0       0       1              0   
 1921        1           0       0      1       0       1              0   
 
       target  y_true     score  y_pred  
 1465       1       0  0.999958       1  
 2137       1       0  0.993204       1  
 1328       1       0  1.000000       1  
 1334       1       0  0.999978       1  
 1921       1       0  0.963093       1  ,
       url_length  at_symbol  sensitive_words_count  path_length  isHttps  \
 1336          61          0                      0           33        0   
 2437          34          0                      0           10        0   
 1346          54          0                      1           29        0   
 2349          23          0                      0            0        0   
 1951          57          0                      0            0        1   
 
       nb_dots  nb_hyphens  nb_and  nb_or  nb_www  nb_com  nb_underscore  \
 1336        5           0       0      0       0       1              0   
 2437        2           0       0      0       0       1              0   
 1346        6           0       0      0       0       1              0   
 2349        1           1       0      0       0       1              0   
 1951        5           0       0      0       1       0              1   
 
       target  y_true     score  y_pred  
 1336       1       1  0.008690       0  
 2437       1       1  0.013219       0  
 1346       1       1  0.001313       0  
 2349       1       1  0.076813       0  
 1951       1       1  0.014438       0  )

import joblib

joblib.dump(
    {
        "model": lgbm,
        "imputer": imp,
        "threshold": float(best_thr),
        "feature_names": list(X.columns)
    },
    "phishing_model_bundle.joblib"
)

print("Saved: phishing_model_bundle.joblib")

Saved: phishing_model_bundle.joblib

import numpy as np
import pandas as pd

bundle = joblib.load("phishing_model_bundle.joblib")
model = bundle["model"]
imputer = bundle["imputer"]
thr = bundle["threshold"]
feat = bundle["feature_names"]

def predict_from_features(feature_dict):
    x = pd.DataFrame([feature_dict])[feat]
    x_i = pd.DataFrame(imputer.transform(x), columns=feat)
    score = model.predict_proba(x_i)[:, 1][0]
    pred = int(score >= thr)
    return {"score": float(score), "prediction": pred}  # 1=phishing (if that is your setup)

# Take one sample from your test set
sample = X_test.iloc[0].to_dict()

out = predict_from_features(sample)
out

{'score': 0.9969901295708007, 'prediction': 1}

for i in range(5):
    out = predict_from_features(X_test.iloc[i].to_dict())
    print(i, out)

0 {'score': 0.9969901295708007, 'prediction': 1}
1 {'score': 0.9999999664536476, 'prediction': 1}
2 {'score': 0.9998004515321444, 'prediction': 1}
3 {'score': 0.9999999996524043, 'prediction': 1}
4 {'score': 4.004544153977602e-08, 'prediction': 0}

i = 0
print("True:", int(y_test.iloc[i]))
print("Pred:", predict_from_features(X_test.iloc[i].to_dict()))

True: 1
Pred: {'score': 0.9969901295708007, 'prediction': 1}

def policy(score, block=0.80, warn=0.50):
    if score >= block:
        return "BLOCK"
    elif score >= warn:
        return "WARN"
    else:
        return "ALLOW"

# Example on some test predictions
scores = model.predict_proba(pd.DataFrame(imputer.transform(X_test[feat]), columns=feat))[:, 1]
pd.Series([policy(s) for s in scores]).value_counts()

BLOCK    360
ALLOW    138
Name: count, dtype: int64

scores = model.predict_proba(pd.DataFrame(imputer.transform(X_test[feat]), columns=feat))[:, 1]
preds = (scores >= thr).astype(int)

export_df = X_test.copy()
export_df["y_true"] = y_test.values
export_df["score"] = scores
export_df["y_pred"] = preds
export_df["action"] = [policy(s) for s in scores]

export_df.to_csv("outputs_test_predictions.csv", index=False)
"Saved outputs_test_predictions.csv"

'Saved outputs_test_predictions.csv'

RANDOM_STATE = 42
TEST_SIZE = 0.20
print("RANDOM_STATE:", RANDOM_STATE, "TEST_SIZE:", TEST_SIZE)

RANDOM_STATE: 42 TEST_SIZE: 0.2

	steps steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.	[('imputer', ...), ('scaler', ...), ...]
	transform_input transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6	None
	memory memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.	None
	verbose verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.	False

	missing_values missing_values: int, float, str, np.nan, None or pandas.NA, default=np.nan The placeholder for the missing values. All occurrences of `missing_values` will be imputed. For pandas' dataframes with nullable integer dtypes with missing values, `missing_values` can be set to either `np.nan` or `pd.NA`.	nan
	strategy strategy: str or Callable, default='mean' The imputation strategy. - If "mean", then replace missing values using the mean along each column. Can only be used with numeric data. - If "median", then replace missing values using the median along each column. Can only be used with numeric data. - If "most_frequent", then replace missing using the most frequent value along each column. Can be used with strings or numeric data. If there is more than one such value, only the smallest is returned. - If "constant", then replace missing values with fill_value. Can be used with strings or numeric data. - If an instance of Callable, then replace missing values using the scalar statistic returned by running the callable over a dense 1d array containing non-missing values of each column. .. versionadded:: 0.20 strategy="constant" for fixed value imputation. .. versionadded:: 1.5 strategy=callable for custom value imputation.	'median'
	fill_value fill_value: str or numerical value, default=None When strategy == "constant", `fill_value` is used to replace all occurrences of missing_values. For string or object data types, `fill_value` must be a string. If `None`, `fill_value` will be 0 when imputing numerical data and "missing_value" for strings or object data types.	None
	copy copy: bool, default=True If True, a copy of X will be created. If False, imputation will be done in-place whenever possible. Note that, in the following cases, a new copy will always be made, even if `copy=False`: - If `X` is not an array of floating values; - If `X` is encoded as a CSR matrix; - If `add_indicator=True`.	True
	add_indicator add_indicator: bool, default=False If True, a :class:`MissingIndicator` transform will stack onto output of the imputer's transform. This allows a predictive estimator to account for missingness despite imputation. If a feature has no missing values at fit/train time, the feature won't appear on the missing indicator even if there are missing values at transform/test time.	False
	keep_empty_features keep_empty_features: bool, default=False If True, features that consist exclusively of missing values when `fit` is called are returned in results when `transform` is called. The imputed value is always `0` except when `strategy="constant"` in which case `fill_value` will be used instead. .. versionadded:: 1.2	False

	penalty penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning:: Some penalties may not work with some solvers. See the parameter `solver` below, to know the compatibility between the penalty and solver. .. versionadded:: 0.19 l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8 `penalty` was deprecated in version 1.8 and will be removed in 1.10. Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for `'penalty='elasticnet'`.	'deprecated'
	C C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.	1.0
	l1_ratio l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning:: Certain values of `l1_ratio`, i.e. some penalties, may not work with some solvers. See the parameter `solver` below, to know the compatibility between the penalty and solver. .. versionchanged:: 1.8 Default value changed from None to 0.0. .. deprecated:: 1.8 `None` is deprecated and will be removed in version 1.10. Always use `l1_ratio` to specify the penalty type.	0.0
	dual dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.	False
	tol tol: float, default=1e-4 Tolerance for stopping criteria.	0.0001
	fit_intercept fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.	True
	intercept_scaling intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a "synthetic" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note:: The synthetic feature weight is subject to L1 or L2 regularization as all other features. To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) `intercept_scaling` has to be increased.	1
	class_weight class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17 class_weight='balanced'	'balanced'
	random_state random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.	None
	solver solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except 'liblinear' minimize the full multinomial loss, 'liblinear' will raise an error. - 'newton-cholesky' is a good choice for `n_samples` >> `n_features * n_classes`, especially with one-hot encoded categorical features with rare categories. Be aware that the memory usage of this solver has a quadratic dependency on `n_features * n_classes` because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag' and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a one-versus-rest scheme for the multiclass setting one can wrap it with the :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning:: The choice of the algorithm depends on the penalty chosen (`l1_ratio=0` for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for Elastic-Net) and on (multinomial) multiclass support: ================= ======================== ====================== solver l1_ratio multinomial multiclass ================= ======================== ====================== 'lbfgs' l1_ratio=0 yes 'liblinear' l1_ratio=1 or l1_ratio=0 no 'newton-cg' l1_ratio=0 yes 'newton-cholesky' l1_ratio=0 yes 'sag' l1_ratio=0 yes 'saga' 0<=l1_ratio<=1 yes ================= ======================== ====================== .. note:: 'sag' and 'saga' fast convergence is only guaranteed on features with approximately the same scale. You can preprocess the data with a scaler from :mod:`sklearn.preprocessing`. .. seealso:: Refer to the :ref:`User Guide ` for more information regarding :class:`LogisticRegression` and more specifically the :ref:`Table ` summarizing solver/penalty supports. .. versionadded:: 0.17 Stochastic Average Gradient (SAG) descent solver. Multinomial support in version 0.18. .. versionadded:: 0.19 SAGA solver. .. versionchanged:: 0.22 The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2 newton-cholesky solver. Multinomial support in version 1.6.	'lbfgs'
	max_iter max_iter: int, default=100 Maximum number of iterations taken for the solvers to converge.	500
	verbose verbose: int, default=0 For the liblinear and lbfgs solvers set verbose to any positive number for verbosity.	0
	warm_start warm_start: bool, default=False When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. Useless for liblinear solver. See :term:`the Glossary `. .. versionadded:: 0.17 warm_start to support lbfgs, newton-cg, sag, saga solvers.	False
	n_jobs n_jobs: int, default=None Does not have any effect. .. deprecated:: 1.8 `n_jobs` is deprecated in version 1.8 and will be removed in 1.10.	None

	url_length	path_length	nb_dots	nb_hyphens	nb_or	nb_www	nb_com
0	42	20	2	0	0	1	1
1	73	52	5	0	0	0	1
2	73	52	5	0	0	0	1
3	73	52	5	1	1	0	1
4	73	52	5	0	0	0	1

	url_length	path_length	nb_dots	nb_hyphens	nb_or	nb_www	nb_com
0	42	20	2	0	0	1	1
1	73	52	5	0	0	0	1
2	73	52	5	0	0	0	1
3	73	52	5	1	1	0	1
4	73	52	5	0	0	0	1

	url_length	path_length	nb_dots	nb_hyphens	nb_or	nb_www	nb_com
0	42	20	2	0	0	1	1
1	73	52	5	0	0	0	1
2	73	52	5	0	0	0	1
3	73	52	5	1	1	0	1
4	73	52	5	0	0	0	1

	url_length	path_length	nb_dots	nb_hyphens	nb_or	nb_www	nb_com
0	42	20	2	0	0	1	1
1	73	52	5	0	0	0	1
2	73	52	5	0	0	0	1
3	73	52	5	1	1	0	1
4	73	52	5	0	0	0	1

	copy copy: bool, default=True If False, try to avoid a copy and do inplace scaling instead. This is not guaranteed to always work inplace; e.g. if the data is not a NumPy array or scipy.sparse CSR matrix, a copy may still be returned.	True
	with_mean with_mean: bool, default=True If True, center the data before scaling. This does not work (and will raise an exception) when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.	True
	with_std with_std: bool, default=True If True, scale the data to unit variance (or equivalently, unit standard deviation).	True

	url_length	path_length	nb_dots	nb_hyphens	nb_or	nb_www	nb_com
0	42	20	2	0	0	1	1
1	73	52	5	0	0	0	1
2	73	52	5	0	0	0	1
3	73	52	5	1	1	0	1
4	73	52	5	0	0	0	1

	url_length	path_length	nb_dots	nb_hyphens	nb_or	nb_www	nb_com
0	42	20	2	0	0	1	1
1	73	52	5	0	0	0	1
2	73	52	5	0	0	0	1
3	73	52	5	1	1	0	1
4	73	52	5	0	0	0	1

	url_length	path_length	nb_dots	nb_hyphens	nb_or	nb_www	nb_com
0	42	20	2	0	0	1	1
1	73	52	5	0	0	0	1
2	73	52	5	0	0	0	1
3	73	52	5	1	1	0	1
4	73	52	5	0	0	0	1

	url_length	path_length	nb_dots	nb_hyphens	nb_or	nb_www	nb_com
0	42	20	2	0	0	1	1
1	73	52	5	0	0	0	1
2	73	52	5	0	0	0	1
3	73	52	5	1	1	0	1
4	73	52	5	0	0	0	1

	url_length	path_length	nb_dots	nb_hyphens	nb_or	nb_www	nb_com
0	42	20	2	0	0	1	1
1	73	52	5	0	0	0	1
2	73	52	5	0	0	0	1
3	73	52	5	1	1	0	1
4	73	52	5	0	0	0	1

	url_length	path_length	nb_dots	nb_hyphens	nb_or	nb_www	nb_com
0	42	20	2	0	0	1	1
1	73	52	5	0	0	0	1
2	73	52	5	0	0	0	1
3	73	52	5	1	1	0	1
4	73	52	5	0	0	0	1

	url_length	path_length	nb_dots	nb_hyphens	nb_or	nb_www	nb_com
0	42	20	2	0	0	1	1
1	73	52	5	0	0	0	1
2	73	52	5	0	0	0	1
3	73	52	5	1	1	0	1
4	73	52	5	0	0	0	1

	url_length	path_length	nb_dots	nb_hyphens	nb_or	nb_www	nb_com
0	42	20	2	0	0	1	1
1	73	52	5	0	0	0	1
2	73	52	5	0	0	0	1
3	73	52	5	1	1	0	1
4	73	52	5	0	0	0	1