In [1]:
import pandas as pd

df = pd.read_csv("phishing_url_dataset.csv")
print(df.shape)
df.head()
(2488, 14)
Out[1]:
url_length valid_url at_symbol sensitive_words_count path_length isHttps nb_dots nb_hyphens nb_and nb_or nb_www nb_com nb_underscore target
0 42 0 0 0 20 0 2 0 0 0 1 1 0 0
1 73 0 0 0 52 0 5 0 0 0 0 1 0 0
2 73 0 0 0 52 0 5 0 0 0 0 1 0 0
3 73 0 0 0 52 0 5 1 0 1 0 1 0 0
4 73 0 0 0 52 0 5 0 0 0 0 1 0 0
In [2]:
import os
print("Current folder:", os.getcwd())
print("Files here:", os.listdir("."))
Current folder: C:\Users\user\phishing_project
Files here: ['.ipynb_checkpoints', '.venv', 'phishing_detection.ipynb', 'phishing_url_dataset.csv']
In [3]:
import pandas as pd
df = pd.read_csv("phishing_url_dataset.csv")
df.head()
Out[3]:
url_length valid_url at_symbol sensitive_words_count path_length isHttps nb_dots nb_hyphens nb_and nb_or nb_www nb_com nb_underscore target
0 42 0 0 0 20 0 2 0 0 0 1 1 0 0
1 73 0 0 0 52 0 5 0 0 0 0 1 0 0
2 73 0 0 0 52 0 5 0 0 0 0 1 0 0
3 73 0 0 0 52 0 5 1 0 1 0 1 0 0
4 73 0 0 0 52 0 5 0 0 0 0 1 0 0
In [4]:
import pandas as pd

df = pd.read_csv("phishing_url_dataset.csv")
print(df.shape)
print(df.columns.tolist())
df.head()
(2488, 14)
['url_length', 'valid_url', 'at_symbol', 'sensitive_words_count', 'path_length', 'isHttps', 'nb_dots', 'nb_hyphens', 'nb_and', 'nb_or', 'nb_www', 'nb_com', 'nb_underscore', 'target']
Out[4]:
url_length valid_url at_symbol sensitive_words_count path_length isHttps nb_dots nb_hyphens nb_and nb_or nb_www nb_com nb_underscore target
0 42 0 0 0 20 0 2 0 0 0 1 1 0 0
1 73 0 0 0 52 0 5 0 0 0 0 1 0 0
2 73 0 0 0 52 0 5 0 0 0 0 1 0 0
3 73 0 0 0 52 0 5 1 0 1 0 1 0 0
4 73 0 0 0 52 0 5 0 0 0 0 1 0 0
In [5]:
for c in df.columns:
    if df[c].nunique() <= 10:
        print("\n", c)
        print(df[c].value_counts())
 valid_url
valid_url
0    1791
1     697
Name: count, dtype: int64

 at_symbol
at_symbol
0    2482
1       4
8       1
9       1
Name: count, dtype: int64

 sensitive_words_count
sensitive_words_count
0    1934
1     532
2      20
3       2
Name: count, dtype: int64

 isHttps
isHttps
0    1424
1    1064
Name: count, dtype: int64

 nb_hyphens
nb_hyphens
0     1810
1      318
2      154
4       74
3       73
5       36
6       14
7        6
10       3
Name: count, dtype: int64

 nb_and
nb_and
0    2451
1      36
2       1
Name: count, dtype: int64

 nb_or
nb_or
0    2083
1     369
2      31
3       5
Name: count, dtype: int64

 nb_www
nb_www
0    1932
1     541
2      15
Name: count, dtype: int64

 nb_com
nb_com
1    1472
0     949
2      63
3       4
Name: count, dtype: int64

 nb_underscore
nb_underscore
0    2241
1     185
2      34
3      22
4       3
6       1
5       1
7       1
Name: count, dtype: int64

 target
target
0    1313
1    1175
Name: count, dtype: int64
In [6]:
target_col = "valid_url"

# 1 = phishing (positive class), 0 = legitimate
df["is_phishing"] = (df[target_col] == 0).astype(int)

df["is_phishing"].value_counts()
Out[6]:
is_phishing
1    1791
0     697
Name: count, dtype: int64
In [7]:
import numpy as np

# clean any inf values if present
df = df.replace([np.inf, -np.inf], np.nan)

X = df.drop(columns=[target_col, "is_phishing"], errors="ignore")
y = df["is_phishing"]

# keep only numeric columns (safe if there is a url text column)
X = X.select_dtypes(include=["number"])

print(X.shape, y.shape)
X.head()
(2488, 13) (2488,)
Out[7]:
url_length at_symbol sensitive_words_count path_length isHttps nb_dots nb_hyphens nb_and nb_or nb_www nb_com nb_underscore target
0 42 0 0 20 0 2 0 0 0 1 1 0 0
1 73 0 0 52 0 5 0 0 0 0 1 0 0
2 73 0 0 52 0 5 0 0 0 0 1 0 0
3 73 0 0 52 0 5 1 0 1 0 1 0 0
4 73 0 0 52 0 5 0 0 0 0 1 0 0
In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

y_train.value_counts(normalize=True), y_test.value_counts(normalize=True)
Out[8]:
(is_phishing
 1    0.720101
 0    0.279899
 Name: proportion, dtype: float64,
 is_phishing
 1    0.718876
 0    0.281124
 Name: proportion, dtype: float64)
In [9]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

baseline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=500, class_weight="balanced"))
])

baseline.fit(X_train, y_train)
Out[9]:
Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler()),
                ('model',
                 LogisticRegression(class_weight='balanced', max_iter=500))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
steps steps: list of tuples

List of (name of step, estimator) tuples that are to be chained in
sequential order. To be compatible with the scikit-learn API, all steps
must define `fit`. All non-last steps must also define `transform`. See
:ref:`Combining Estimators ` for more details.
[('imputer', ...), ('scaler', ...), ...]
transform_input transform_input: list of str, default=None

The names of the :term:`metadata` parameters that should be transformed by the
pipeline before passing it to the step consuming it.

This enables transforming some input arguments to ``fit`` (other than ``X``)
to be transformed by the steps of the pipeline up to the step which requires
them. Requirement is defined via :ref:`metadata routing `.
For instance, this can be used to pass a validation set through the pipeline.

You can only set this if metadata routing is enabled, which you
can enable using ``sklearn.set_config(enable_metadata_routing=True)``.

.. versionadded:: 1.6
None
memory memory: str or object with the joblib.Memory interface, default=None

Used to cache the fitted transformers of the pipeline. The last step
will never be cached, even if it is a transformer. By default, no
caching is performed. If a string is given, it is the path to the
caching directory. Enabling caching triggers a clone of the transformers
before fitting. Therefore, the transformer instance given to the
pipeline cannot be inspected directly. Use the attribute ``named_steps``
or ``steps`` to inspect estimators within the pipeline. Caching the
transformers is advantageous when fitting is time consuming. See
:ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py`
for an example on how to enable caching.
None
verbose verbose: bool, default=False

If True, the time elapsed while fitting each step will be printed as it
is completed.
False
Parameters
missing_values missing_values: int, float, str, np.nan, None or pandas.NA, default=np.nan

The placeholder for the missing values. All occurrences of
`missing_values` will be imputed. For pandas' dataframes with
nullable integer dtypes with missing values, `missing_values`
can be set to either `np.nan` or `pd.NA`.
nan
strategy strategy: str or Callable, default='mean'

The imputation strategy.

- If "mean", then replace missing values using the mean along
each column. Can only be used with numeric data.
- If "median", then replace missing values using the median along
each column. Can only be used with numeric data.
- If "most_frequent", then replace missing using the most frequent
value along each column. Can be used with strings or numeric data.
If there is more than one such value, only the smallest is returned.
- If "constant", then replace missing values with fill_value. Can be
used with strings or numeric data.
- If an instance of Callable, then replace missing values using the
scalar statistic returned by running the callable over a dense 1d
array containing non-missing values of each column.

.. versionadded:: 0.20
strategy="constant" for fixed value imputation.

.. versionadded:: 1.5
strategy=callable for custom value imputation.
'median'
fill_value fill_value: str or numerical value, default=None

When strategy == "constant", `fill_value` is used to replace all
occurrences of missing_values. For string or object data types,
`fill_value` must be a string.
If `None`, `fill_value` will be 0 when imputing numerical
data and "missing_value" for strings or object data types.
None
copy copy: bool, default=True

If True, a copy of X will be created. If False, imputation will
be done in-place whenever possible. Note that, in the following cases,
a new copy will always be made, even if `copy=False`:

- If `X` is not an array of floating values;
- If `X` is encoded as a CSR matrix;
- If `add_indicator=True`.
True
add_indicator add_indicator: bool, default=False

If True, a :class:`MissingIndicator` transform will stack onto output
of the imputer's transform. This allows a predictive estimator
to account for missingness despite imputation. If a feature has no
missing values at fit/train time, the feature won't appear on
the missing indicator even if there are missing values at
transform/test time.
False
keep_empty_features keep_empty_features: bool, default=False

If True, features that consist exclusively of missing values when
`fit` is called are returned in results when `transform` is called.
The imputed value is always `0` except when `strategy="constant"`
in which case `fill_value` will be used instead.

.. versionadded:: 1.2
False
Parameters
copy copy: bool, default=True

If False, try to avoid a copy and do inplace scaling instead.
This is not guaranteed to always work inplace; e.g. if the data is
not a NumPy array or scipy.sparse CSR matrix, a copy may still be
returned.
True
with_mean with_mean: bool, default=True

If True, center the data before scaling.
This does not work (and will raise an exception) when attempted on
sparse matrices, because centering them entails building a dense
matrix which in common use cases is likely to be too large to fit in
memory.
True
with_std with_std: bool, default=True

If True, scale the data to unit variance (or equivalently,
unit standard deviation).
True
Parameters
penalty penalty: {'l1', 'l2', 'elasticnet', None}, default='l2'

Specify the norm of the penalty:

- `None`: no penalty is added;
- `'l2'`: add a L2 penalty term and it is the default choice;
- `'l1'`: add a L1 penalty term;
- `'elasticnet'`: both L1 and L2 penalty terms are added.

.. warning::
Some penalties may not work with some solvers. See the parameter
`solver` below, to know the compatibility between the penalty and
solver.

.. versionadded:: 0.19
l1 penalty with SAGA solver (allowing 'multinomial' + L1)

.. deprecated:: 1.8
`penalty` was deprecated in version 1.8 and will be removed in 1.10.
Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for
`penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for
`'penalty='elasticnet'`.
'deprecated'
C C: float, default=1.0

Inverse of regularization strength; must be a positive float.
Like in support vector machines, smaller values specify stronger
regularization. `C=np.inf` results in unpenalized logistic regression.
For a visual example on the effect of tuning the `C` parameter
with an L1 penalty, see:
:ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.
1.0
l1_ratio l1_ratio: float, default=0.0

The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting
`l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty.
Any value between 0 and 1 gives an Elastic-Net penalty of the form
`l1_ratio * L1 + (1 - l1_ratio) * L2`.

.. warning::
Certain values of `l1_ratio`, i.e. some penalties, may not work with some
solvers. See the parameter `solver` below, to know the compatibility between
the penalty and solver.

.. versionchanged:: 1.8
Default value changed from None to 0.0.

.. deprecated:: 1.8
`None` is deprecated and will be removed in version 1.10. Always use
`l1_ratio` to specify the penalty type.
0.0
dual dual: bool, default=False

Dual (constrained) or primal (regularized, see also
:ref:`this equation `) formulation. Dual formulation
is only implemented for l2 penalty with liblinear solver. Prefer `dual=False`
when n_samples > n_features.
False
tol tol: float, default=1e-4

Tolerance for stopping criteria.
0.0001
fit_intercept fit_intercept: bool, default=True

Specifies if a constant (a.k.a. bias or intercept) should be
added to the decision function.
True
intercept_scaling intercept_scaling: float, default=1

Useful only when the solver `liblinear` is used
and `self.fit_intercept` is set to `True`. In this case, `x` becomes
`[x, self.intercept_scaling]`,
i.e. a "synthetic" feature with constant value equal to
`intercept_scaling` is appended to the instance vector.
The intercept becomes
``intercept_scaling * synthetic_feature_weight``.

.. note::
The synthetic feature weight is subject to L1 or L2
regularization as all other features.
To lessen the effect of regularization on synthetic feature weight
(and therefore on the intercept) `intercept_scaling` has to be increased.
1
class_weight class_weight: dict or 'balanced', default=None

Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one.

The "balanced" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data
as ``n_samples / (n_classes * np.bincount(y))``.

Note that these weights will be multiplied with sample_weight (passed
through the fit method) if sample_weight is specified.

.. versionadded:: 0.17
*class_weight='balanced'*
'balanced'
random_state random_state: int, RandomState instance, default=None

Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the
data. See :term:`Glossary ` for details.
None
solver solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs'

Algorithm to use in the optimization problem. Default is 'lbfgs'.
To choose a solver, you might want to consider the following aspects:

- 'lbfgs' is a good default solver because it works reasonably well for a wide
class of problems.
- For :term:`multiclass` problems (`n_classes >= 3`), all solvers except
'liblinear' minimize the full multinomial loss, 'liblinear' will raise an
error.
- 'newton-cholesky' is a good choice for
`n_samples` >> `n_features * n_classes`, especially with one-hot encoded
categorical features with rare categories. Be aware that the memory usage
of this solver has a quadratic dependency on `n_features * n_classes`
because it explicitly computes the full Hessian matrix.
- For small datasets, 'liblinear' is a good choice, whereas 'sag'
and 'saga' are faster for large ones;
- 'liblinear' can only handle binary classification by default. To apply a
one-versus-rest scheme for the multiclass setting one can wrap it with the
:class:`~sklearn.multiclass.OneVsRestClassifier`.

.. warning::
The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`
for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for
Elastic-Net) and on (multinomial) multiclass support:

================= ======================== ======================
solver l1_ratio multinomial multiclass
================= ======================== ======================
'lbfgs' l1_ratio=0 yes
'liblinear' l1_ratio=1 or l1_ratio=0 no
'newton-cg' l1_ratio=0 yes
'newton-cholesky' l1_ratio=0 yes
'sag' l1_ratio=0 yes
'saga' 0<=l1_ratio<=1 yes
================= ======================== ======================

.. note::
'sag' and 'saga' fast convergence is only guaranteed on features
with approximately the same scale. You can preprocess the data with
a scaler from :mod:`sklearn.preprocessing`.

.. seealso::
Refer to the :ref:`User Guide ` for more
information regarding :class:`LogisticRegression` and more specifically the
:ref:`Table `
summarizing solver/penalty supports.

.. versionadded:: 0.17
Stochastic Average Gradient (SAG) descent solver. Multinomial support in
version 0.18.
.. versionadded:: 0.19
SAGA solver.
.. versionchanged:: 0.22
The default solver changed from 'liblinear' to 'lbfgs' in 0.22.
.. versionadded:: 1.2
newton-cholesky solver. Multinomial support in version 1.6.
'lbfgs'
max_iter max_iter: int, default=100

Maximum number of iterations taken for the solvers to converge.
500
verbose verbose: int, default=0

For the liblinear and lbfgs solvers set verbose to any positive
number for verbosity.
0
warm_start warm_start: bool, default=False

When set to True, reuse the solution of the previous call to fit as
initialization, otherwise, just erase the previous solution.
Useless for liblinear solver. See :term:`the Glossary `.

.. versionadded:: 0.17
*warm_start* to support *lbfgs*, *newton-cg*, *sag*, *saga* solvers.
False
n_jobs n_jobs: int, default=None

Does not have any effect.

.. deprecated:: 1.8
`n_jobs` is deprecated in version 1.8 and will be removed in 1.10.
None
In [10]:
from sklearn.metrics import (confusion_matrix, classification_report, 
                             roc_auc_score, average_precision_score)

pred = baseline.predict(X_test)
proba = baseline.predict_proba(X_test)[:, 1]

print("Confusion matrix:\n", confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
print("ROC-AUC:", roc_auc_score(y_test, proba))
print("PR-AUC:", average_precision_score(y_test, proba))
Confusion matrix:
 [[123  17]
 [ 60 298]]
              precision    recall  f1-score   support

           0       0.67      0.88      0.76       140
           1       0.95      0.83      0.89       358

    accuracy                           0.85       498
   macro avg       0.81      0.86      0.82       498
weighted avg       0.87      0.85      0.85       498

ROC-AUC: 0.9333998403830807
PR-AUC: 0.9710189862716602
In [11]:
import lightgbm as lgb

imp = SimpleImputer(strategy="median")
X_train_i = imp.fit_transform(X_train)
X_test_i  = imp.transform(X_test)

lgbm = lgb.LGBMClassifier(
    n_estimators=800,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.9,
    colsample_bytree=0.9,
    class_weight="balanced",
    random_state=42
)

lgbm.fit(X_train_i, y_train)

proba_lgb = lgbm.predict_proba(X_test_i)[:, 1]
pred_lgb = (proba_lgb >= 0.5).astype(int)

print("Confusion matrix:\n", confusion_matrix(y_test, pred_lgb))
print(classification_report(y_test, pred_lgb))
print("ROC-AUC:", roc_auc_score(y_test, proba_lgb))
print("PR-AUC:", average_precision_score(y_test, proba_lgb))
[LightGBM] [Info] Number of positive: 1433, number of negative: 557
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005895 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 1990, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
Confusion matrix:
 [[128  12]
 [ 10 348]]
              precision    recall  f1-score   support

           0       0.93      0.91      0.92       140
           1       0.97      0.97      0.97       358

    accuracy                           0.96       498
   macro avg       0.95      0.94      0.95       498
weighted avg       0.96      0.96      0.96       498

ROC-AUC: 0.9795091779728651
PR-AUC: 0.9877211844875851
C:\Users\user\phishing_project\.venv\Lib\site-packages\sklearn\utils\validation.py:2691: UserWarning: X does not have valid feature names, but LGBMClassifier was fitted with feature names
  warnings.warn(
In [12]:
import pandas as pd
from sklearn.impute import SimpleImputer
import lightgbm as lgb

imp = SimpleImputer(strategy="median")

X_train_i = pd.DataFrame(
    imp.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index
)

X_test_i = pd.DataFrame(
    imp.transform(X_test),
    columns=X_test.columns,
    index=X_test.index
)

lgbm = lgb.LGBMClassifier(
    n_estimators=800,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.9,
    colsample_bytree=0.9,
    class_weight="balanced",
    random_state=42
)

lgbm.fit(X_train_i, y_train)

proba_lgb = lgbm.predict_proba(X_test_i)[:, 1]
[LightGBM] [Info] Number of positive: 1433, number of negative: 557
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001947 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 1990, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
In [13]:
import numpy as np
from sklearn.metrics import precision_recall_curve, confusion_matrix, classification_report, average_precision_score

prec, rec, thr = precision_recall_curve(y_test, proba_lgb)
f1 = 2*(prec*rec)/(prec+rec+1e-9)

best_idx = np.argmax(f1)
best_thr = thr[best_idx]
best_thr
Out[13]:
np.float64(0.33997688500658646)
In [14]:
pred_thr = (proba_lgb >= best_thr).astype(int)

print("PR-AUC:", average_precision_score(y_test, proba_lgb))
print("Threshold:", best_thr)
print(confusion_matrix(y_test, pred_thr))
print(classification_report(y_test, pred_thr))
PR-AUC: 0.9877211844875851
Threshold: 0.33997688500658646
[[128  12]
 [  9 349]]
              precision    recall  f1-score   support

           0       0.93      0.91      0.92       140
           1       0.97      0.97      0.97       358

    accuracy                           0.96       498
   macro avg       0.95      0.94      0.95       498
weighted avg       0.96      0.96      0.96       498

In [17]:
import shap
import matplotlib.pyplot as plt

# X_test_i should be a DataFrame with columns (recommended).
# If your X_test_i is numpy array, create DataFrame first:
# X_test_i = pd.DataFrame(X_test_i, columns=X_test.columns, index=X_test.index)

explainer = shap.TreeExplainer(lgbm)
shap_values = explainer.shap_values(X_test_i)

shap.summary_plot(shap_values, X_test_i, plot_type="bar", show=False)
plt.title("SHAP Feature Importance (Phishing Prediction)")
plt.show()
C:\Users\user\phishing_project\.venv\Lib\site-packages\shap\explainers\_tree.py:587: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
  warnings.warn(
No description has been provided for this image
In [16]:
!pip install -U ipywidgets jupyterlab_widgets
Requirement already satisfied: ipywidgets in .\.venv\Lib\site-packages (8.1.8)
Requirement already satisfied: jupyterlab_widgets in .\.venv\Lib\site-packages (3.0.16)
Requirement already satisfied: comm>=0.1.3 in .\.venv\Lib\site-packages (from ipywidgets) (0.2.3)
Requirement already satisfied: ipython>=6.1.0 in .\.venv\Lib\site-packages (from ipywidgets) (9.10.0)
Requirement already satisfied: traitlets>=4.3.1 in .\.venv\Lib\site-packages (from ipywidgets) (5.14.3)
Requirement already satisfied: widgetsnbextension~=4.0.14 in .\.venv\Lib\site-packages (from ipywidgets) (4.0.15)
Requirement already satisfied: colorama>=0.4.4 in .\.venv\Lib\site-packages (from ipython>=6.1.0->ipywidgets) (0.4.6)
Requirement already satisfied: decorator>=4.3.2 in .\.venv\Lib\site-packages (from ipython>=6.1.0->ipywidgets) (5.2.1)
Requirement already satisfied: ipython-pygments-lexers>=1.0.0 in .\.venv\Lib\site-packages (from ipython>=6.1.0->ipywidgets) (1.1.1)
Requirement already satisfied: jedi>=0.18.1 in .\.venv\Lib\site-packages (from ipython>=6.1.0->ipywidgets) (0.19.2)
Requirement already satisfied: matplotlib-inline>=0.1.5 in .\.venv\Lib\site-packages (from ipython>=6.1.0->ipywidgets) (0.2.1)
Requirement already satisfied: prompt_toolkit<3.1.0,>=3.0.41 in .\.venv\Lib\site-packages (from ipython>=6.1.0->ipywidgets) (3.0.52)
Requirement already satisfied: pygments>=2.11.0 in .\.venv\Lib\site-packages (from ipython>=6.1.0->ipywidgets) (2.19.2)
Requirement already satisfied: stack_data>=0.6.0 in .\.venv\Lib\site-packages (from ipython>=6.1.0->ipywidgets) (0.6.3)
Requirement already satisfied: typing_extensions>=4.6 in .\.venv\Lib\site-packages (from ipython>=6.1.0->ipywidgets) (4.15.0)
Requirement already satisfied: wcwidth in .\.venv\Lib\site-packages (from prompt_toolkit<3.1.0,>=3.0.41->ipython>=6.1.0->ipywidgets) (0.5.3)
Requirement already satisfied: parso<0.9.0,>=0.8.4 in .\.venv\Lib\site-packages (from jedi>=0.18.1->ipython>=6.1.0->ipywidgets) (0.8.5)
Requirement already satisfied: executing>=1.2.0 in .\.venv\Lib\site-packages (from stack_data>=0.6.0->ipython>=6.1.0->ipywidgets) (2.2.1)
Requirement already satisfied: asttokens>=2.1.0 in .\.venv\Lib\site-packages (from stack_data>=0.6.0->ipython>=6.1.0->ipywidgets) (3.0.1)
Requirement already satisfied: pure-eval in .\.venv\Lib\site-packages (from stack_data>=0.6.0->ipython>=6.1.0->ipywidgets) (0.2.3)
In [18]:
import pandas as pd

pred_thr = (proba_lgb >= best_thr).astype(int)

results = X_test.copy()
results["y_true"] = y_test.values
results["score"] = proba_lgb
results["y_pred"] = pred_thr

fp = results[(results.y_true==0) & (results.y_pred==1)]  # legit predicted phishing
fn = results[(results.y_true==1) & (results.y_pred==0)]  # phishing predicted legit

print("False Positives:", len(fp))
print("False Negatives:", len(fn))

fp.head(), fn.head()
False Positives: 12
False Negatives: 9
Out[18]:
(      url_length  at_symbol  sensitive_words_count  path_length  isHttps  \
 1465          17          0                      0            0        1   
 2137          69          0                      1            0        0   
 1328          24          0                      0            1        0   
 1334          49          0                      0            3        0   
 1921          36          0                      0            4        0   
 
       nb_dots  nb_hyphens  nb_and  nb_or  nb_www  nb_com  nb_underscore  \
 1465        0           0       0      0       0       0              1   
 2137        8           0       0      0       0       0              0   
 1328        1           0       0      1       0       1              0   
 1334        7           1       0      0       0       1              0   
 1921        1           0       0      1       0       1              0   
 
       target  y_true     score  y_pred  
 1465       1       0  0.999958       1  
 2137       1       0  0.993204       1  
 1328       1       0  1.000000       1  
 1334       1       0  0.999978       1  
 1921       1       0  0.963093       1  ,
       url_length  at_symbol  sensitive_words_count  path_length  isHttps  \
 1336          61          0                      0           33        0   
 2437          34          0                      0           10        0   
 1346          54          0                      1           29        0   
 2349          23          0                      0            0        0   
 1951          57          0                      0            0        1   
 
       nb_dots  nb_hyphens  nb_and  nb_or  nb_www  nb_com  nb_underscore  \
 1336        5           0       0      0       0       1              0   
 2437        2           0       0      0       0       1              0   
 1346        6           0       0      0       0       1              0   
 2349        1           1       0      0       0       1              0   
 1951        5           0       0      0       1       0              1   
 
       target  y_true     score  y_pred  
 1336       1       1  0.008690       0  
 2437       1       1  0.013219       0  
 1346       1       1  0.001313       0  
 2349       1       1  0.076813       0  
 1951       1       1  0.014438       0  )
In [19]:
import joblib

joblib.dump(
    {
        "model": lgbm,
        "imputer": imp,
        "threshold": float(best_thr),
        "feature_names": list(X.columns)
    },
    "phishing_model_bundle.joblib"
)

print("Saved: phishing_model_bundle.joblib")
Saved: phishing_model_bundle.joblib
In [20]:
import numpy as np
import pandas as pd

bundle = joblib.load("phishing_model_bundle.joblib")
model = bundle["model"]
imputer = bundle["imputer"]
thr = bundle["threshold"]
feat = bundle["feature_names"]

def predict_from_features(feature_dict):
    x = pd.DataFrame([feature_dict])[feat]
    x_i = pd.DataFrame(imputer.transform(x), columns=feat)
    score = model.predict_proba(x_i)[:, 1][0]
    pred = int(score >= thr)
    return {"score": float(score), "prediction": pred}  # 1=phishing (if that is your setup)
In [21]:
# Take one sample from your test set
sample = X_test.iloc[0].to_dict()

out = predict_from_features(sample)
out
Out[21]:
{'score': 0.9969901295708007, 'prediction': 1}
In [22]:
for i in range(5):
    out = predict_from_features(X_test.iloc[i].to_dict())
    print(i, out)
0 {'score': 0.9969901295708007, 'prediction': 1}
1 {'score': 0.9999999664536476, 'prediction': 1}
2 {'score': 0.9998004515321444, 'prediction': 1}
3 {'score': 0.9999999996524043, 'prediction': 1}
4 {'score': 4.004544153977602e-08, 'prediction': 0}
In [23]:
i = 0
print("True:", int(y_test.iloc[i]))
print("Pred:", predict_from_features(X_test.iloc[i].to_dict()))
True: 1
Pred: {'score': 0.9969901295708007, 'prediction': 1}
In [24]:
def policy(score, block=0.80, warn=0.50):
    if score >= block:
        return "BLOCK"
    elif score >= warn:
        return "WARN"
    else:
        return "ALLOW"

# Example on some test predictions
scores = model.predict_proba(pd.DataFrame(imputer.transform(X_test[feat]), columns=feat))[:, 1]
pd.Series([policy(s) for s in scores]).value_counts()
Out[24]:
BLOCK    360
ALLOW    138
Name: count, dtype: int64
In [25]:
scores = model.predict_proba(pd.DataFrame(imputer.transform(X_test[feat]), columns=feat))[:, 1]
preds = (scores >= thr).astype(int)

export_df = X_test.copy()
export_df["y_true"] = y_test.values
export_df["score"] = scores
export_df["y_pred"] = preds
export_df["action"] = [policy(s) for s in scores]

export_df.to_csv("outputs_test_predictions.csv", index=False)
"Saved outputs_test_predictions.csv"
Out[25]:
'Saved outputs_test_predictions.csv'
In [26]:
RANDOM_STATE = 42
TEST_SIZE = 0.20
print("RANDOM_STATE:", RANDOM_STATE, "TEST_SIZE:", TEST_SIZE)
RANDOM_STATE: 42 TEST_SIZE: 0.2
In [ ]: