# Import necessary packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
# Load cleaned datasets
crypto_returns = pd.read_csv('../../data/processed-data/crypto_returns_cleaned.csv')
fx_rates = pd.read_csv('../../data/processed-data/fx_rates_cleaned.csv')

# # Filter data
# analysis_start_date = '2022-03-06'
# crypto_filtered = crypto_returns[crypto_returns['Date'] >= analysis_start_date].dropna()
# fx_filtered = fx_rates[fx_rates['Date'] >= analysis_start_date].dropna()

# Date filtering above is disabled: analyze the full cleaned datasets.
crypto_filtered = crypto_returns
fx_filtered = fx_rates
# Function for PCA Visualization (2 Components)
def plot_pca_scatter(data, title):
    """Project the feature columns of *data* onto their first two principal
    components and display them as a 2-D scatter plot.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a 'Date' column; every other column is treated as a
        numeric feature.
    title : str
        Prefix used in the plot title.
    """
    # Standardize features (mean 0, std 1) so scale differences do not bias PCA.
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data.drop(columns=['Date']))
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(data_scaled)
    df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])
    sns.scatterplot(x='PC1', y='PC2', data=df, alpha=0.6)
    plt.title(f'{title} (PCA with 2 Components)')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.grid(True, linestyle='dashed')
    plt.show()
# 2-component PCA scatter plots for each market separately.
plot_pca_scatter(crypto_filtered, "Cryptocurrency Returns")
plot_pca_scatter(fx_filtered, "Forex Percent Changes")

# Combined PCA Visualization: merge the two markets on their shared dates.
combined_data = pd.merge(crypto_filtered, fx_filtered, on='Date')
plot_pca_scatter(combined_data, "Combined Market Analysis")
Unsupervised Learning
Introduction
This page serves to illustrate the descriptions and technical details of the Unsupervised Machine Learning section. The objective of this section is to investigate the underlying patterns and cluster classes of the data through the use of downscaling and clustering techniques, with the aim of identifying suitable methods for categorizing the data and providing an analytical foundation for subsequent supervised learning.
Dimensionality Reduction
The level of data dimensionality has a direct impact on the efficiency of data analysis and modeling, particularly in the context of multivariate financial data. An excessive level of dimensionality may result in the introduction of an overwhelming amount of noise, thereby increasing the complexity of the model to a considerable extent. The application of dimensionality reduction techniques can effectively reduce the dimensionality of features while retaining as much pertinent information in the data as possible. This provides a more effective analytical basis for subsequent identification of data patterns and cluster classes.
In this section, two distinct methods for dimensionality reduction are employed: principal component analysis (PCA) and t-distributed stochastic neighbor embedding (t-SNE). Principal component analysis is a linear dimensionality reduction technique that generates principal components by capturing the direction of the largest variance in the data, thereby transforming the data from a high-dimensional to a low-dimensional space. t-SNE is a nonlinear dimensionality reduction technique, which is capable of capturing complex nonlinear structures in the data. Furthermore, it is visualization-friendly and suitable for discovering local patterns in the data.
Principal component analysis (PCA)
The PCA technique necessitates the calculation of a covariance matrix, therefore it is essential that the data utilized be complete and devoid of any missing values. We have confirmed that there are no missing values in the data during data cleaning, so we can directly use the cleaned datasets. Furthermore, the selected timeframe is more focused on recent market conditions. Once the scope of the data had been confirmed, the decision was taken to apply the PCA technique to three aspects of the data: within the cryptocurrency market, within the FX market, and the cryptocurrency combined with the FX market.
A function was defined for the purpose of visualizing the first two principal components of the data, with the objective of demonstrating the state of the data in two-dimensional space in the form of a scatter plot. First, the data were normalized to prevent any bias in the results of the PCA analysis that might result from significant differences in the range of eigenvalues. All volatility columns, with the exception of the date column, were standardized using the StandardScaler function, with the objective of ensuring that the mean value of the data was 0 and the standard deviation was 1.
# Load cleaned datasets (weekly frequency)
crypto_returns_w = pd.read_csv('../../data/processed-data/weekly_crypto_returns.csv')
fx_rates_w = pd.read_csv('../../data/processed-data/weekly_fx_rates.csv')

# # Filter data
# analysis_start_date = '2022-03-06'
# crypto_filtered = crypto_returns[crypto_returns['Date'] >= analysis_start_date].dropna()
# fx_filtered = fx_rates[fx_rates['Date'] >= analysis_start_date].dropna()

# Date filtering above is disabled: analyze the full weekly datasets.
crypto_filtered_w = crypto_returns_w
fx_filtered_w = fx_rates_w
# Function for PCA Visualization (2 Components)
def plot_pca_scatter(data, title):
    """Project the feature columns of *data* onto their first two principal
    components and display them as a 2-D scatter plot.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a 'Date' column; every other column is treated as a
        numeric feature.
    title : str
        Prefix used in the plot title.
    """
    # Standardize features (mean 0, std 1) so scale differences do not bias PCA.
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data.drop(columns=['Date']))
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(data_scaled)
    df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])
    sns.scatterplot(x='PC1', y='PC2', data=df, alpha=0.6)
    plt.title(f'{title} (PCA with 2 Components)')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.grid(True, linestyle='dashed')
    plt.show()
# 2-component PCA scatter plots for each market separately (weekly data).
plot_pca_scatter(crypto_filtered_w, "Cryptocurrency Returns")
plot_pca_scatter(fx_filtered_w, "Forex Percent Changes")

# Combined PCA Visualization: merge the two markets on their shared dates.
combined_data_w = pd.merge(crypto_filtered_w, fx_filtered_w, on='Date')
plot_pca_scatter(combined_data_w, "Combined Market Analysis")
In order to gain a deeper comprehension of the underlying significance of the principal components, we proceed to define the functions utilized for the calculation and visualization of the loadings. In order to accommodate the heightened complexity of the feature dimensions, a threshold of 80% for both markets and 70% for the composite market was established for the cumulative explained variance. In conclusion, the optimal principal component scores for each dataset have been derived. Subsequently, the loading matrices were generated and visualized in the form of heatmaps, thereby facilitating an understanding of the weighting of each feature in the principal components.
The preceding analysis demonstrates that the first principal component captures the majority of the variance in both cryptocurrency and FX data. However, the former exhibits a stronger correlation with intra-market volatility, while the latter demonstrates a relatively more even distribution of variance, indicating that the correlation within the FX market is lower. With regard to the aggregate market, the cryptocurrency feature tends to exert a greater influence on the initial principal components, whereas the reverse is true for the FX feature. These findings provide a more profound comprehension of the data structure, thereby establishing a foundation for the subsequent selection of classification criteria.
# Function to calculate and visualize PCA loadings
def plot_pca_loadings(data, title, threshold=0.8, n=10):
    """Fit an n-component PCA, report how many components reach the target
    cumulative explained variance, and visualize the component loadings.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a 'Date' column; every other column is treated as a
        numeric feature.
    title : str
        Title text used in the generated figures.
    threshold : float, default 0.8
        Target cumulative explained-variance ratio.
    n : int, default 10
        Number of principal components to fit.

    Returns
    -------
    pd.DataFrame
        Loadings matrix with rows PC1..PCn and one column per feature.
    """
    # Standardize features so no single series dominates the covariance matrix.
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data.drop(columns=['Date']))
    pca = PCA(n_components=n)
    pca.fit(data_scaled)

    # Explained variance ratio
    explained_variance = pca.explained_variance_ratio_
    cumulative_variance = explained_variance.cumsum()

    # Find the optimal number of components: first index reaching the threshold.
    optimal_components = (cumulative_variance >= threshold).argmax() + 1
    print(f"Optimal number of components: {optimal_components}")

    # Plot cumulative explained variance. Labels are supplied so plt.legend()
    # has artists to show (the unlabeled original emitted a UserWarning).
    plt.plot(cumulative_variance, marker='o', color='blue',
             label='Cumulative explained variance')
    plt.axhline(y=threshold, color='red', linestyle='dashed',
                label=f'Threshold = {threshold}')
    plt.title(f'Cumulative Explained Variance for {title}')
    plt.xlabel('Number of Principal Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.legend()
    plt.show()

    # Create loadings DataFrame (rows = components, columns = features).
    loadings_df = pd.DataFrame(
        data=pca.components_,
        columns=data.drop(columns=['Date']).columns,
        index=[f'PC{i+1}' for i in range(pca.n_components_)]
    )
    print(loadings_df)

    # Visualization: heatmap of each feature's weight in each component.
    plt.figure(figsize=(15, 8))
    sns.heatmap(loadings_df, cmap='coolwarm', annot=True, center=0, annot_kws={"size": 6})
    plt.title(f'Principal Component Loadings for {title}')
    plt.xlabel('Features')
    plt.ylabel('Principal Components')
    plt.show()
    return loadings_df
# Loadings for each market (80% variance target) and the combined market (70%).
plot_pca_loadings(crypto_filtered, "Cryptocurrency Returns", threshold=0.8, n=10)
plot_pca_loadings(fx_filtered, "Forex Percent Changes", threshold=0.8, n=10)
plot_pca_loadings(combined_data, "Combined Market Analysis", threshold=0.7, n=15)
Optimal number of components: 5
/tmp/ipykernel_876/2038098554.py:22: UserWarning: No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
plt.legend()
ADA-USD AVAX-USD BNB-USD BTC-USD DOGE-USD ETH-USD SOL-USD \
PC1 0.296632 0.285957 0.287717 0.307581 0.249772 0.319374 0.283420
PC2 -0.013821 0.022958 0.007132 -0.007617 0.009395 0.042817 0.035533
PC3 -0.029351 -0.104603 -0.072291 -0.093723 -0.046063 -0.174923 -0.049574
PC4 -0.124758 -0.055354 -0.030931 -0.004298 -0.201420 0.003510 -0.108936
PC5 0.293604 0.188648 -0.048568 -0.188073 -0.004386 -0.287701 0.184738
PC6 -0.094319 0.186643 0.058201 0.054227 0.044005 -0.021049 0.193899
PC7 0.029196 0.090038 0.075996 -0.093110 0.875694 -0.162135 -0.101363
PC8 0.114977 0.504772 0.139879 -0.024946 -0.294106 -0.159237 0.477494
PC9 0.058179 -0.092344 0.897895 0.011150 -0.156250 -0.097303 -0.343708
PC10 -0.518790 -0.448450 0.249129 -0.049626 0.119515 -0.026291 0.647813
STETH-USD TON11419-USD TRX-USD USDC-USD USDT-USD WSTETH-USD \
PC1 0.315330 0.177072 0.245007 0.017143 0.047644 0.317359
PC2 0.043212 0.069481 -0.186439 -0.668170 0.680873 0.043398
PC3 -0.183148 -0.108572 0.615553 -0.268637 0.143700 -0.179655
PC4 0.008744 0.916800 0.061802 0.062767 0.026808 0.011714
PC5 -0.305550 0.223331 -0.063434 0.307536 0.267609 -0.295966
PC6 -0.032557 -0.185124 0.054868 0.510850 0.555395 -0.016753
PC7 -0.181556 0.147828 -0.032754 -0.128329 -0.133523 -0.166048
PC8 -0.165740 -0.005921 -0.040954 -0.321779 -0.334003 -0.165231
PC9 -0.104160 -0.037472 -0.032015 0.018265 0.043721 -0.118195
PC10 -0.036299 0.060591 -0.021407 0.010437 -0.035704 -0.025396
WTRX-USD XRP-USD
PC1 0.239251 0.238329
PC2 -0.205001 0.024220
PC3 0.624222 0.005278
PC4 0.085653 -0.267200
PC5 -0.062180 0.564484
PC6 0.050411 -0.547147
PC7 -0.043163 -0.231460
PC8 -0.041720 -0.314035
PC9 -0.014934 0.003802
PC10 -0.000127 0.152014
Optimal number of components: 8
/tmp/ipykernel_876/2038098554.py:22: UserWarning: No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
plt.legend()
AUDUSD=X CNY=X EURUSD=X GBPUSD=X HKD=X IDR=X INR=X \
PC1 0.340517 0.182222 0.323001 0.317804 0.050662 0.237734 0.189237
PC2 -0.061051 0.273564 -0.032386 -0.014747 0.281045 0.019846 -0.139242
PC3 0.181102 -0.338390 0.129917 0.245033 0.442325 -0.366316 -0.306523
PC4 0.017034 -0.039713 -0.055557 0.005240 -0.604561 -0.019880 0.038438
PC5 -0.135564 -0.417332 -0.044897 -0.099649 0.441957 0.330939 0.303667
PC6 -0.067990 0.137726 0.173633 0.162191 0.098370 0.027948 0.556095
PC7 -0.006994 0.345126 -0.048207 -0.005372 0.133637 -0.079627 0.466008
PC8 -0.079313 0.559965 -0.159160 -0.179228 0.263414 0.274347 -0.345043
PC9 0.096932 0.322566 0.147600 0.180425 0.065235 -0.287435 -0.149290
PC10 0.084103 -0.120162 0.254253 0.162989 -0.142694 0.664750 -0.277827
JPY=X MXN=X MYR=X NZDUSD=X PHP=X RUB=X SGD=X \
PC1 0.241449 0.208690 0.082528 0.331358 0.196538 -0.036898 0.360333
PC2 0.216587 -0.416445 0.579128 -0.007704 0.042537 0.445725 0.051718
PC3 0.126096 0.023510 -0.276192 0.191184 -0.391277 0.245810 0.083593
PC4 0.284445 -0.173401 -0.357592 -0.001448 0.035184 0.607259 0.011755
PC5 0.019010 0.155224 -0.202595 -0.146848 0.431125 0.306714 -0.073302
PC6 0.121417 -0.522385 -0.318063 -0.008898 -0.231141 -0.230949 0.051384
PC7 -0.384023 0.308025 0.063755 -0.046602 -0.376113 0.423757 -0.058077
PC8 0.182437 0.107116 -0.478764 -0.139475 -0.106627 -0.049001 -0.045693
PC9 -0.456653 -0.160738 -0.265844 0.114058 0.580148 0.120734 0.012091
PC10 -0.446706 -0.114200 -0.004091 0.004069 -0.265113 0.140675 0.062957
THB=X ZAR=X
PC1 0.292692 0.276885
PC2 0.115187 -0.221794
PC3 -0.044069 -0.015605
PC4 0.085863 0.106253
PC5 0.094157 -0.131164
PC6 -0.038890 -0.316966
PC7 -0.112601 0.226829
PC8 0.219535 0.038814
PC9 -0.206793 -0.104624
PC10 -0.072010 -0.189712
Optimal number of components: 8
/tmp/ipykernel_876/2038098554.py:22: UserWarning: No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
plt.legend()
ADA-USD AVAX-USD BNB-USD BTC-USD DOGE-USD ETH-USD SOL-USD \
PC1 0.293498 0.286318 0.283867 0.302554 0.264117 0.313450 0.284729
PC2 0.023910 0.007172 0.008747 0.024679 0.010219 0.020394 0.014885
PC3 0.025144 0.052340 -0.035043 -0.060683 0.001964 0.004162 0.069824
PC4 0.034099 0.117451 0.027028 -0.000394 0.046188 0.016678 0.104651
PC5 -0.023264 -0.010634 0.049452 -0.013760 -0.008174 -0.032933 -0.010091
PC6 0.020011 0.010359 0.082189 0.003202 0.046633 0.030018 0.046906
PC7 0.044567 -0.018610 -0.052471 -0.135488 -0.002947 -0.211466 0.016134
PC8 0.004990 -0.051438 -0.010604 0.009497 0.024124 0.045051 -0.047575
PC9 -0.037142 -0.034844 0.048532 -0.017616 0.028581 0.018072 -0.094239
PC10 -0.082807 -0.002042 0.002176 0.024660 -0.011781 0.043105 -0.050601
PC11 -0.129423 -0.066216 -0.059895 0.006088 -0.131918 0.011669 -0.092803
PC12 -0.089889 -0.106287 -0.091285 0.048727 -0.179267 0.030423 -0.001798
PC13 0.088050 0.064018 0.087298 -0.069839 0.066598 -0.200459 0.043405
PC14 0.016483 -0.076288 0.015039 0.031086 -0.062088 0.045593 -0.046150
PC15 0.240628 0.008345 -0.136701 -0.138070 -0.050078 -0.148183 0.028831
STETH-USD TON11419-USD TRX-USD ... INR=X JPY=X MXN=X \
PC1 0.310049 0.180555 0.247578 ... -0.015043 0.005549 -0.013174
PC2 0.024689 -0.009249 0.024561 ... 0.188382 0.242862 0.207874
PC3 -0.000431 0.107995 -0.142471 ... 0.090685 -0.128589 0.194840
PC4 0.020667 -0.027092 -0.271873 ... -0.043682 0.156324 -0.348002
PC5 -0.032146 0.204946 0.039588 ... -0.313577 0.093884 0.032259
PC6 0.026910 -0.143199 -0.144754 ... 0.114491 0.249284 -0.093869
PC7 -0.214722 -0.187239 0.509634 ... -0.055563 0.074011 -0.032314
PC8 0.045682 -0.074653 -0.023516 ... 0.277797 -0.102909 0.276915
PC9 0.029231 0.374937 -0.075767 ... -0.175946 -0.300464 0.491072
PC10 0.042884 0.018877 0.055113 ... 0.654030 -0.285372 -0.047555
PC11 0.018639 0.618015 0.120617 ... 0.248870 0.033946 -0.304397
PC12 0.033237 0.321106 0.070703 ... -0.300171 0.071277 -0.000238
PC13 -0.217482 0.095777 0.025771 ... 0.236330 -0.012402 0.086806
PC14 0.046791 -0.135576 0.010801 ... -0.131433 -0.462037 -0.149482
PC15 -0.154514 0.382900 -0.116062 ... 0.011111 0.007384 -0.145232
MYR=X NZDUSD=X PHP=X RUB=X SGD=X THB=X ZAR=X
PC1 0.000500 -0.030228 -0.041416 0.009739 -0.019582 -0.022599 -0.014217
PC2 0.082700 0.329263 0.193899 -0.035931 0.359489 0.291548 0.276483
PC3 -0.156996 0.039123 -0.032544 -0.383910 -0.006589 -0.094063 0.011983
PC4 0.517526 0.000219 0.057875 0.285422 0.044168 0.084351 -0.240271
PC5 -0.162313 0.189350 -0.389081 0.248432 0.086925 -0.044957 -0.019593
PC6 -0.438223 0.011722 0.029122 0.567464 0.008241 0.061379 0.064367
PC7 0.190468 -0.017658 0.067139 0.119311 -0.004758 -0.007544 -0.009417
PC8 -0.147638 -0.136997 0.356112 0.155448 -0.079968 0.048166 -0.097822
PC9 0.368007 0.005059 0.047902 0.317227 -0.046052 -0.033366 0.311736
PC10 -0.045363 -0.021013 -0.433872 0.103756 -0.007395 -0.119605 -0.015108
PC11 0.117976 0.011677 0.255657 0.089511 0.043857 -0.014095 -0.211164
PC12 -0.452595 -0.108764 0.058270 0.072770 -0.026171 0.177643 -0.064024
PC13 -0.059687 -0.084642 0.011826 0.333458 -0.058484 -0.045011 0.207021
PC14 -0.149227 0.156768 0.519304 0.116639 0.032483 -0.311257 -0.112557
PC15 -0.073596 0.037388 -0.131617 -0.124500 0.038822 -0.167175 -0.015392
[15 rows x 31 columns]
ADA-USD | AVAX-USD | BNB-USD | BTC-USD | DOGE-USD | ETH-USD | SOL-USD | STETH-USD | TON11419-USD | TRX-USD | ... | INR=X | JPY=X | MXN=X | MYR=X | NZDUSD=X | PHP=X | RUB=X | SGD=X | THB=X | ZAR=X | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PC1 | 0.293498 | 0.286318 | 0.283867 | 0.302554 | 0.264117 | 0.313450 | 0.284729 | 0.310049 | 0.180555 | 0.247578 | ... | -0.015043 | 0.005549 | -0.013174 | 0.000500 | -0.030228 | -0.041416 | 0.009739 | -0.019582 | -0.022599 | -0.014217 |
PC2 | 0.023910 | 0.007172 | 0.008747 | 0.024679 | 0.010219 | 0.020394 | 0.014885 | 0.024689 | -0.009249 | 0.024561 | ... | 0.188382 | 0.242862 | 0.207874 | 0.082700 | 0.329263 | 0.193899 | -0.035931 | 0.359489 | 0.291548 | 0.276483 |
PC3 | 0.025144 | 0.052340 | -0.035043 | -0.060683 | 0.001964 | 0.004162 | 0.069824 | -0.000431 | 0.107995 | -0.142471 | ... | 0.090685 | -0.128589 | 0.194840 | -0.156996 | 0.039123 | -0.032544 | -0.383910 | -0.006589 | -0.094063 | 0.011983 |
PC4 | 0.034099 | 0.117451 | 0.027028 | -0.000394 | 0.046188 | 0.016678 | 0.104651 | 0.020667 | -0.027092 | -0.271873 | ... | -0.043682 | 0.156324 | -0.348002 | 0.517526 | 0.000219 | 0.057875 | 0.285422 | 0.044168 | 0.084351 | -0.240271 |
PC5 | -0.023264 | -0.010634 | 0.049452 | -0.013760 | -0.008174 | -0.032933 | -0.010091 | -0.032146 | 0.204946 | 0.039588 | ... | -0.313577 | 0.093884 | 0.032259 | -0.162313 | 0.189350 | -0.389081 | 0.248432 | 0.086925 | -0.044957 | -0.019593 |
PC6 | 0.020011 | 0.010359 | 0.082189 | 0.003202 | 0.046633 | 0.030018 | 0.046906 | 0.026910 | -0.143199 | -0.144754 | ... | 0.114491 | 0.249284 | -0.093869 | -0.438223 | 0.011722 | 0.029122 | 0.567464 | 0.008241 | 0.061379 | 0.064367 |
PC7 | 0.044567 | -0.018610 | -0.052471 | -0.135488 | -0.002947 | -0.211466 | 0.016134 | -0.214722 | -0.187239 | 0.509634 | ... | -0.055563 | 0.074011 | -0.032314 | 0.190468 | -0.017658 | 0.067139 | 0.119311 | -0.004758 | -0.007544 | -0.009417 |
PC8 | 0.004990 | -0.051438 | -0.010604 | 0.009497 | 0.024124 | 0.045051 | -0.047575 | 0.045682 | -0.074653 | -0.023516 | ... | 0.277797 | -0.102909 | 0.276915 | -0.147638 | -0.136997 | 0.356112 | 0.155448 | -0.079968 | 0.048166 | -0.097822 |
PC9 | -0.037142 | -0.034844 | 0.048532 | -0.017616 | 0.028581 | 0.018072 | -0.094239 | 0.029231 | 0.374937 | -0.075767 | ... | -0.175946 | -0.300464 | 0.491072 | 0.368007 | 0.005059 | 0.047902 | 0.317227 | -0.046052 | -0.033366 | 0.311736 |
PC10 | -0.082807 | -0.002042 | 0.002176 | 0.024660 | -0.011781 | 0.043105 | -0.050601 | 0.042884 | 0.018877 | 0.055113 | ... | 0.654030 | -0.285372 | -0.047555 | -0.045363 | -0.021013 | -0.433872 | 0.103756 | -0.007395 | -0.119605 | -0.015108 |
PC11 | -0.129423 | -0.066216 | -0.059895 | 0.006088 | -0.131918 | 0.011669 | -0.092803 | 0.018639 | 0.618015 | 0.120617 | ... | 0.248870 | 0.033946 | -0.304397 | 0.117976 | 0.011677 | 0.255657 | 0.089511 | 0.043857 | -0.014095 | -0.211164 |
PC12 | -0.089889 | -0.106287 | -0.091285 | 0.048727 | -0.179267 | 0.030423 | -0.001798 | 0.033237 | 0.321106 | 0.070703 | ... | -0.300171 | 0.071277 | -0.000238 | -0.452595 | -0.108764 | 0.058270 | 0.072770 | -0.026171 | 0.177643 | -0.064024 |
PC13 | 0.088050 | 0.064018 | 0.087298 | -0.069839 | 0.066598 | -0.200459 | 0.043405 | -0.217482 | 0.095777 | 0.025771 | ... | 0.236330 | -0.012402 | 0.086806 | -0.059687 | -0.084642 | 0.011826 | 0.333458 | -0.058484 | -0.045011 | 0.207021 |
PC14 | 0.016483 | -0.076288 | 0.015039 | 0.031086 | -0.062088 | 0.045593 | -0.046150 | 0.046791 | -0.135576 | 0.010801 | ... | -0.131433 | -0.462037 | -0.149482 | -0.149227 | 0.156768 | 0.519304 | 0.116639 | 0.032483 | -0.311257 | -0.112557 |
PC15 | 0.240628 | 0.008345 | -0.136701 | -0.138070 | -0.050078 | -0.148183 | 0.028831 | -0.154514 | 0.382900 | -0.116062 | ... | 0.011111 | 0.007384 | -0.145232 | -0.073596 | 0.037388 | -0.131617 | -0.124500 | 0.038822 | -0.167175 | -0.015392 |
15 rows × 31 columns
# Function to calculate and visualize PCA loadings
def plot_pca_loadings(data, title, threshold=0.8, n=10):
    """Fit an n-component PCA, report how many components reach the target
    cumulative explained variance, and visualize the component loadings.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a 'Date' column; every other column is treated as a
        numeric feature.
    title : str
        Title text used in the generated figures.
    threshold : float, default 0.8
        Target cumulative explained-variance ratio.
    n : int, default 10
        Number of principal components to fit.

    Returns
    -------
    pd.DataFrame
        Loadings matrix with rows PC1..PCn and one column per feature.
    """
    # Standardize features so no single series dominates the covariance matrix.
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data.drop(columns=['Date']))
    pca = PCA(n_components=n)
    pca.fit(data_scaled)

    # Explained variance ratio
    explained_variance = pca.explained_variance_ratio_
    cumulative_variance = explained_variance.cumsum()

    # Find the optimal number of components: first index reaching the threshold.
    optimal_components = (cumulative_variance >= threshold).argmax() + 1
    print(f"Optimal number of components: {optimal_components}")

    # Plot cumulative explained variance. Labels are supplied so plt.legend()
    # has artists to show (the unlabeled original emitted a UserWarning).
    plt.plot(cumulative_variance, marker='o', color='blue',
             label='Cumulative explained variance')
    plt.axhline(y=threshold, color='red', linestyle='dashed',
                label=f'Threshold = {threshold}')
    plt.title(f'Cumulative Explained Variance for {title}')
    plt.xlabel('Number of Principal Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.legend()
    plt.show()

    # Create loadings DataFrame (rows = components, columns = features).
    loadings_df = pd.DataFrame(
        data=pca.components_,
        columns=data.drop(columns=['Date']).columns,
        index=[f'PC{i+1}' for i in range(pca.n_components_)]
    )
    print(loadings_df)

    # Visualization: heatmap of each feature's weight in each component.
    plt.figure(figsize=(15, 8))
    sns.heatmap(loadings_df, cmap='coolwarm', annot=True, center=0, annot_kws={"size": 6})
    plt.title(f'Principal Component Loadings for {title}')
    plt.xlabel('Features')
    plt.ylabel('Principal Components')
    plt.show()
    return loadings_df
# Loadings for each weekly market (80% variance target) and the combined market (70%).
plot_pca_loadings(crypto_filtered_w, "Cryptocurrency Returns", threshold=0.8, n=10)
plot_pca_loadings(fx_filtered_w, "Forex Percent Changes", threshold=0.8, n=10)
plot_pca_loadings(combined_data_w, "Combined Market Analysis", threshold=0.7, n=15)
Optimal number of components: 5
/tmp/ipykernel_876/222584158.py:22: UserWarning: No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
plt.legend()
ADA-USD AVAX-USD BNB-USD BTC-USD DOGE-USD ETH-USD SOL-USD \
PC1 0.285326 0.280062 0.294358 0.308639 0.266859 0.324924 0.289690
PC2 -0.080526 -0.112152 0.043546 0.044076 -0.041167 -0.019797 -0.112252
PC3 -0.156215 -0.234363 -0.096298 -0.070927 -0.222203 -0.080499 -0.045299
PC4 -0.191742 -0.157876 -0.054716 0.101797 -0.043764 0.160414 -0.156522
PC5 -0.028283 0.281692 0.041417 -0.079657 0.118880 -0.073134 -0.003050
PC6 0.075185 0.031726 -0.090544 -0.184821 0.082749 -0.214834 0.098076
PC7 -0.393735 -0.313993 0.408389 -0.124137 0.685289 -0.001551 -0.266450
PC8 -0.293880 -0.254856 0.012088 0.185527 -0.424622 0.259806 -0.228616
PC9 -0.301459 -0.164284 0.521860 -0.111451 -0.286825 -0.070718 0.693292
PC10 0.651972 -0.308483 0.525785 -0.145177 -0.223119 -0.032533 -0.328492
STETH-USD TON11419-USD TRX-USD USDC-USD USDT-USD WSTETH-USD \
PC1 0.325654 0.149173 0.228051 -0.071288 0.105561 0.323939
PC2 -0.033798 -0.127896 0.501534 0.536027 -0.368197 -0.029358
PC3 -0.065295 0.124754 0.392001 -0.427945 0.554228 -0.070427
PC4 0.149972 0.758618 -0.035796 -0.097237 -0.230232 0.164381
PC5 -0.077650 0.243853 -0.066102 0.591391 0.636915 -0.072145
PC6 -0.208193 0.542055 0.036871 0.137448 -0.191002 -0.221344
PC7 -0.001439 -0.060617 -0.024477 -0.051599 0.059396 -0.003064
PC8 0.250754 -0.045672 -0.131653 0.324330 0.159392 0.255614
PC9 -0.061231 0.038671 -0.038146 0.040889 -0.054006 -0.067025
PC10 -0.041531 0.090180 -0.061189 0.024574 0.052235 -0.058603
WTRX-USD XRP-USD
PC1 0.223830 0.214401
PC2 0.498268 -0.146114
PC3 0.420183 0.056113
PC4 -0.043325 -0.426816
PC5 -0.048946 -0.243785
PC6 0.045078 0.665911
PC7 -0.021316 0.112434
PC8 -0.140544 0.464829
PC9 -0.064114 -0.097772
PC10 -0.015993 -0.053596
Optimal number of components: 6
/tmp/ipykernel_876/222584158.py:22: UserWarning: No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
plt.legend()
AUDUSD=X CNY=X EURUSD=X GBPUSD=X HKD=X IDR=X INR=X \
PC1 0.305819 0.243538 0.296402 0.291292 0.089436 0.259496 0.217037
PC2 -0.011555 -0.248754 0.020892 -0.096929 -0.412892 0.201715 0.399651
PC3 0.002039 -0.011524 0.089480 -0.014143 0.639106 -0.082131 -0.019454
PC4 0.182923 -0.245004 -0.011573 0.042404 0.252546 -0.067170 -0.257369
PC5 -0.165015 0.091001 -0.281473 -0.366627 -0.089735 0.301096 -0.147826
PC6 -0.132458 -0.089271 0.030810 0.059623 0.355785 0.378345 0.533726
PC7 0.147092 0.546303 0.204918 0.108693 -0.334761 0.009549 0.027342
PC8 -0.177886 -0.138838 0.206487 0.258318 -0.016710 -0.327505 -0.030589
PC9 -0.045463 0.616026 -0.223705 -0.290616 0.278433 -0.226999 0.252861
PC10 0.233391 0.105950 -0.129777 -0.096716 0.068349 0.505322 -0.372600
JPY=X MXN=X MYR=X NZDUSD=X PHP=X RUB=X SGD=X \
PC1 0.247926 0.177005 0.234562 0.303036 0.252319 0.013253 0.327980
PC2 -0.111274 0.455604 -0.290794 -0.170159 0.065443 0.435821 -0.040202
PC3 -0.267038 0.083931 0.136955 0.036783 -0.100491 0.664845 -0.028241
PC4 -0.227062 0.570909 -0.044114 0.122420 -0.266902 -0.367809 -0.042736
PC5 0.095751 0.273648 0.596334 -0.242372 -0.170525 -0.019819 0.028830
PC6 -0.010086 0.099567 -0.130231 -0.051095 -0.153893 -0.393340 -0.062903
PC7 -0.416627 0.047887 0.018176 0.053350 -0.529040 0.020288 0.098198
PC8 0.563559 0.123170 -0.121626 -0.204067 -0.504153 0.093329 0.149640
PC9 0.164277 0.101673 -0.290175 -0.230124 0.033123 -0.050621 -0.037501
PC10 0.412754 0.041201 -0.290238 0.094287 -0.129375 0.208414 0.062041
THB=X ZAR=X
PC1 0.285548 0.243763
PC2 0.003028 0.167174
PC3 0.060818 -0.144911
PC4 -0.122245 0.390298
PC5 0.313616 -0.056704
PC6 -0.016208 -0.450759
PC7 -0.099063 -0.185496
PC8 0.230080 -0.061855
PC9 -0.012790 0.333526
PC10 -0.402877 -0.133155
Optimal number of components: 6
/tmp/ipykernel_876/222584158.py:22: UserWarning: No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
plt.legend()
ADA-USD AVAX-USD BNB-USD BTC-USD DOGE-USD ETH-USD SOL-USD \
PC1 0.226764 0.224876 0.208730 0.216392 0.177224 0.230066 0.192151
PC2 -0.159474 -0.152446 -0.204719 -0.219287 -0.204529 -0.227788 -0.223501
PC3 -0.049461 -0.085662 0.048229 0.006378 -0.023319 -0.022799 -0.070653
PC4 -0.065922 -0.069334 -0.078894 -0.043550 -0.040545 -0.040018 -0.064199
PC5 0.136102 0.148857 -0.020801 -0.141132 0.012142 -0.056339 0.114542
PC6 -0.088350 -0.016733 -0.012815 -0.059310 0.120095 -0.035347 -0.094689
PC7 -0.073672 -0.003403 -0.007049 0.104278 0.003868 0.164679 -0.089374
PC8 -0.133138 -0.066338 -0.098842 0.067870 -0.290981 0.058383 -0.120771
PC9 -0.051646 0.259649 0.159538 -0.033437 0.327209 -0.057639 0.018829
PC10 0.105291 0.147228 -0.012256 0.075779 0.143648 0.097920 -0.048381
PC11 0.131251 0.061925 -0.009688 0.124856 -0.123812 0.036735 0.043298
PC12 -0.203261 -0.258926 0.135326 0.019586 0.187213 0.090876 -0.136031
PC13 0.149999 0.140932 -0.250041 0.014923 -0.066966 -0.144929 0.252173
PC14 0.127017 0.380042 -0.339121 0.142484 -0.179833 -0.022761 -0.003795
PC15 0.228984 0.094378 0.118561 -0.108314 0.125630 -0.149718 -0.102573
STETH-USD TON11419-USD TRX-USD ... INR=X JPY=X MXN=X \
PC1 0.231308 0.101302 0.156727 ... 0.146533 0.159969 0.136733
PC2 -0.227291 -0.111653 -0.164996 ... 0.162002 0.196690 0.105989
PC3 -0.033916 -0.113782 0.418257 ... 0.170832 -0.000785 0.237534
PC4 -0.028202 0.307410 0.238813 ... -0.035364 -0.272601 0.112486
PC5 -0.042303 0.046552 -0.250262 ... 0.328299 -0.109931 0.332317
PC6 -0.034143 0.180702 -0.029995 ... -0.260500 -0.169816 0.531097
PC7 0.151292 0.609858 -0.219784 ... 0.093388 -0.221934 0.018578
PC8 0.070723 0.057471 0.133315 ... 0.019438 -0.053511 -0.016560
PC9 -0.071679 0.006081 -0.098799 ... -0.264828 0.162818 -0.076238
PC10 0.112015 -0.411314 -0.165922 ... 0.272873 -0.090893 0.134520
PC11 0.031730 -0.195307 -0.060551 ... -0.325017 -0.216398 -0.221067
PC12 0.076320 0.094346 -0.063957 ... 0.020415 0.218215 -0.039164
PC13 -0.143334 0.373577 -0.004895 ... 0.143775 0.104014 -0.311729
PC14 -0.038749 0.015745 0.016485 ... -0.408975 -0.005825 0.064063
PC15 -0.134544 0.128060 0.053786 ... 0.303869 -0.226032 -0.095705
MYR=X NZDUSD=X PHP=X RUB=X SGD=X THB=X ZAR=X
PC1 0.169002 0.221143 0.178111 -0.013814 0.221159 0.190147 0.161774
PC2 0.152752 0.196708 0.173683 0.047043 0.242791 0.215651 0.187912
PC3 -0.208223 -0.111014 0.064290 0.201655 -0.039401 0.032262 0.097551
PC4 0.114160 0.055736 -0.068910 0.495136 -0.028150 0.038656 -0.122354
PC5 -0.166591 -0.123403 -0.027714 0.431088 -0.018546 -0.054020 0.096295
PC6 0.070971 0.038348 -0.236911 -0.309996 -0.023774 -0.050603 0.409194
PC7 -0.099603 0.049944 -0.033079 -0.006141 -0.004190 0.096315 -0.044605
PC8 -0.417737 0.178687 0.034876 -0.302135 -0.006844 -0.281710 0.096363
PC9 0.048632 0.129688 -0.090826 0.113096 0.027971 0.040506 0.027325
PC10 -0.308234 0.107247 -0.057587 -0.055032 -0.070999 -0.122039 -0.126611
PC11 -0.165123 0.140209 0.168596 0.443202 -0.004504 -0.187574 0.353002
PC12 -0.014488 0.029572 0.611897 0.074942 -0.064554 0.088313 0.204203
PC13 -0.074672 -0.059578 0.151919 -0.090584 -0.104241 -0.072164 -0.035687
PC14 0.068629 -0.027488 0.224434 -0.039455 -0.070321 -0.032674 0.078585
PC15 -0.144504 0.031325 0.274734 -0.219231 -0.121804 -0.123602 0.229274
[15 rows x 31 columns]
ADA-USD | AVAX-USD | BNB-USD | BTC-USD | DOGE-USD | ETH-USD | SOL-USD | STETH-USD | TON11419-USD | TRX-USD | ... | INR=X | JPY=X | MXN=X | MYR=X | NZDUSD=X | PHP=X | RUB=X | SGD=X | THB=X | ZAR=X | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PC1 | 0.226764 | 0.224876 | 0.208730 | 0.216392 | 0.177224 | 0.230066 | 0.192151 | 0.231308 | 0.101302 | 0.156727 | ... | 0.146533 | 0.159969 | 0.136733 | 0.169002 | 0.221143 | 0.178111 | -0.013814 | 0.221159 | 0.190147 | 0.161774 |
PC2 | -0.159474 | -0.152446 | -0.204719 | -0.219287 | -0.204529 | -0.227788 | -0.223501 | -0.227291 | -0.111653 | -0.164996 | ... | 0.162002 | 0.196690 | 0.105989 | 0.152752 | 0.196708 | 0.173683 | 0.047043 | 0.242791 | 0.215651 | 0.187912 |
PC3 | -0.049461 | -0.085662 | 0.048229 | 0.006378 | -0.023319 | -0.022799 | -0.070653 | -0.033916 | -0.113782 | 0.418257 | ... | 0.170832 | -0.000785 | 0.237534 | -0.208223 | -0.111014 | 0.064290 | 0.201655 | -0.039401 | 0.032262 | 0.097551 |
PC4 | -0.065922 | -0.069334 | -0.078894 | -0.043550 | -0.040545 | -0.040018 | -0.064199 | -0.028202 | 0.307410 | 0.238813 | ... | -0.035364 | -0.272601 | 0.112486 | 0.114160 | 0.055736 | -0.068910 | 0.495136 | -0.028150 | 0.038656 | -0.122354 |
PC5 | 0.136102 | 0.148857 | -0.020801 | -0.141132 | 0.012142 | -0.056339 | 0.114542 | -0.042303 | 0.046552 | -0.250262 | ... | 0.328299 | -0.109931 | 0.332317 | -0.166591 | -0.123403 | -0.027714 | 0.431088 | -0.018546 | -0.054020 | 0.096295 |
PC6 | -0.088350 | -0.016733 | -0.012815 | -0.059310 | 0.120095 | -0.035347 | -0.094689 | -0.034143 | 0.180702 | -0.029995 | ... | -0.260500 | -0.169816 | 0.531097 | 0.070971 | 0.038348 | -0.236911 | -0.309996 | -0.023774 | -0.050603 | 0.409194 |
PC7 | -0.073672 | -0.003403 | -0.007049 | 0.104278 | 0.003868 | 0.164679 | -0.089374 | 0.151292 | 0.609858 | -0.219784 | ... | 0.093388 | -0.221934 | 0.018578 | -0.099603 | 0.049944 | -0.033079 | -0.006141 | -0.004190 | 0.096315 | -0.044605 |
PC8 | -0.133138 | -0.066338 | -0.098842 | 0.067870 | -0.290981 | 0.058383 | -0.120771 | 0.070723 | 0.057471 | 0.133315 | ... | 0.019438 | -0.053511 | -0.016560 | -0.417737 | 0.178687 | 0.034876 | -0.302135 | -0.006844 | -0.281710 | 0.096363 |
PC9 | -0.051646 | 0.259649 | 0.159538 | -0.033437 | 0.327209 | -0.057639 | 0.018829 | -0.071679 | 0.006081 | -0.098799 | ... | -0.264828 | 0.162818 | -0.076238 | 0.048632 | 0.129688 | -0.090826 | 0.113096 | 0.027971 | 0.040506 | 0.027325 |
PC10 | 0.105291 | 0.147228 | -0.012256 | 0.075779 | 0.143648 | 0.097920 | -0.048381 | 0.112015 | -0.411314 | -0.165922 | ... | 0.272873 | -0.090893 | 0.134520 | -0.308234 | 0.107247 | -0.057587 | -0.055032 | -0.070999 | -0.122039 | -0.126611 |
PC11 | 0.131251 | 0.061925 | -0.009688 | 0.124856 | -0.123812 | 0.036735 | 0.043298 | 0.031730 | -0.195307 | -0.060551 | ... | -0.325017 | -0.216398 | -0.221067 | -0.165123 | 0.140209 | 0.168596 | 0.443202 | -0.004504 | -0.187574 | 0.353002 |
PC12 | -0.203261 | -0.258926 | 0.135326 | 0.019586 | 0.187213 | 0.090876 | -0.136031 | 0.076320 | 0.094346 | -0.063957 | ... | 0.020415 | 0.218215 | -0.039164 | -0.014488 | 0.029572 | 0.611897 | 0.074942 | -0.064554 | 0.088313 | 0.204203 |
PC13 | 0.149999 | 0.140932 | -0.250041 | 0.014923 | -0.066966 | -0.144929 | 0.252173 | -0.143334 | 0.373577 | -0.004895 | ... | 0.143775 | 0.104014 | -0.311729 | -0.074672 | -0.059578 | 0.151919 | -0.090584 | -0.104241 | -0.072164 | -0.035687 |
PC14 | 0.127017 | 0.380042 | -0.339121 | 0.142484 | -0.179833 | -0.022761 | -0.003795 | -0.038749 | 0.015745 | 0.016485 | ... | -0.408975 | -0.005825 | 0.064063 | 0.068629 | -0.027488 | 0.224434 | -0.039455 | -0.070321 | -0.032674 | 0.078585 |
PC15 | 0.228984 | 0.094378 | 0.118561 | -0.108314 | 0.125630 | -0.149718 | -0.102573 | -0.134544 | 0.128060 | 0.053786 | ... | 0.303869 | -0.226032 | -0.095705 | -0.144504 | 0.031325 | 0.274734 | -0.219231 | -0.121804 | -0.123602 | 0.229274 |
15 rows × 31 columns
t-distributed stochastic neighbor embedding (t-SNE)
Following an initial trial of linear techniques, a nonlinear t-SNE approach was employed to facilitate the exploration of data relationships and patterns. To obtain a more comprehensive understanding, separate analyses were conducted with perplexity values of 5, 50, and 200. Similarly, following data normalization, the data were reduced to two dimensions and visualized in the form of scatter plots. As can be observed, the application of multiple perplexity values still results in the absence of discernible intermarket patterns.
# Import necessary packages
from sklearn.manifold import TSNE
# Function to apply t-SNE
def apply_tsne(data, title, perplexities=(5, 50, 200)):
    """Project `data` to 2-D with t-SNE at several perplexities and plot each result.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a 'Date' column (dropped before scaling); all remaining
        columns are treated as numeric features.
    title : str
        Prefix used in each plot title.
    perplexities : iterable of int, optional
        Perplexity values to try; one scatter plot is produced per value.
        (A tuple default replaces the original mutable-list default.)
    """
    # Standardize features so t-SNE distances are not dominated by scale.
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data.drop(columns=['Date']))
    for perplexity in perplexities:
        tsne = TSNE(n_components=2, perplexity=perplexity, random_state=22)
        tsne_result = tsne.fit_transform(data_scaled)

        # Visualization
        tsne_df = pd.DataFrame(data=tsne_result, columns=['TSNE1', 'TSNE2'])
        sns.scatterplot(x='TSNE1', y='TSNE2', data=tsne_df)
        plt.title(f'{title} (t-SNE, Perplexity={perplexity})')
        plt.xlabel('t-SNE Component 1')
        plt.ylabel('t-SNE Component 2')
        plt.grid(True, linestyle='dashed', alpha=0.7)
        plt.show()

apply_tsne(crypto_filtered, "Cryptocurrency Returns")
apply_tsne(fx_filtered, "Forex Percent Changes")
apply_tsne(combined_data, "Combined Market Analysis")
# Function to apply t-SNE (weekly data: smaller sample, so smaller perplexities)
def apply_tsne(data, title, perplexities=(5, 40, 80)):
    """Project `data` to 2-D with t-SNE at several perplexities and plot each result.

    Re-defines the earlier helper with perplexities suited to the weekly
    dataset's smaller sample size. `data` must contain a 'Date' column
    (dropped before scaling). A tuple default replaces the original
    mutable-list default.
    """
    # Standardize features so t-SNE distances are not dominated by scale.
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data.drop(columns=['Date']))
    for perplexity in perplexities:
        tsne = TSNE(n_components=2, perplexity=perplexity, random_state=22)
        tsne_result = tsne.fit_transform(data_scaled)

        # Visualization
        tsne_df = pd.DataFrame(data=tsne_result, columns=['TSNE1', 'TSNE2'])
        sns.scatterplot(x='TSNE1', y='TSNE2', data=tsne_df)
        plt.title(f'{title} (t-SNE, Perplexity={perplexity})')
        plt.xlabel('t-SNE Component 1')
        plt.ylabel('t-SNE Component 2')
        plt.grid(True, linestyle='dashed', alpha=0.7)
        plt.show()

apply_tsne(crypto_filtered_w, "Cryptocurrency Returns")
apply_tsne(fx_filtered_w, "Forex Percent Changes")
apply_tsne(combined_data_w, "Combined Market Analysis")
Evaluation and Comparison
With regard to preserving data structure, PCA exhibited superior performance, as evidenced by its capacity to illustrate the distribution of variance. Conversely, the implementation of t-SNE in the present study did not permit the observation of notable inter-market clustering. Moreover, the two methods differed in their visualization focus. In conclusion, the joint application of the two methods offers a more comprehensive analytical perspective for subsequent unsupervised learning analysis.
Clustering
The application of clustering techniques can facilitate the exploration of underlying patterns within datasets. The identification of cluster structures may, in turn, facilitate a more profound understanding of market dynamics and the analysis of data for groupable qualities. Three clustering approaches were employed: K-means, DBSCAN and hierarchical clustering.
K-Means
The K-means algorithm is employed to partition clusters through the calculation and minimization of the distance between data points and their nearest clustering centres. The process commences with the random selection of k initial centroids, which are then iterated with continuous updating until no further changes are observed. A combination of the elbow method and the Silhouette score method is employed as a means of selecting an appropriate model. The elbow method is a technique for identifying the optimal value of k at the inflection point of the curve. This is achieved by calculating the inertia values for various values of k and plotting the resulting curve. The Silhouette score method is a method for evaluating the fitness of a model. This is done by calculating Silhouette scores (ranging from -1 to 1) for each data point at different values of k. The higher the score, the more accurately the clusters are categorized.
The elbow method curve was used to determine that the number of clusters that may be appropriate is 2, 3, 4, 5, and 6. A Silhouette score analysis was then performed on these k values, which demonstrated that the profile coefficients were relatively high for k=3 for daily data and k=2 for weekly data. However, the difference was not statistically significant, and no discernible cluster boundaries could be observed in the cluster scatterplot. This further suggests that the volatility data of the two markets most likely do not have a clear linear grouping.
# Import necessary packages
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples
import matplotlib.cm as cm
# Prepare PCA-reduced data: standardize the combined daily features
# (Date column excluded) and keep the first 8 principal components.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(combined_data.drop(columns=['Date']))
pca = PCA(n_components=8)
data_pca = pca.fit_transform(data_scaled)
# Elbow method
def elbow_method(data, max_k=15):
    """Plot K-means inertia for k = 1..max_k to locate the elbow point.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Feature matrix to cluster.
    max_k : int, optional
        Largest number of clusters to evaluate.
    """
    inertias = []
    ks = range(1, max_k + 1)
    for k in ks:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(data)
        inertias.append(kmeans.inertia_)

    plt.plot(ks, inertias, marker='o', color='b')
    plt.xlabel('Number of Clusters (k)')
    plt.ylabel('Inertia')
    plt.title('Elbow Method for Optimal k')
    plt.grid(True, linestyle='dashed', alpha=0.7)
    plt.show()
# Silhouette score
def silhouette(data, range_n_clusters):
    """For each candidate k: fit K-means on `data`, print the average
    silhouette score, and draw a per-sample silhouette plot beside a 2-D
    scatter of the first two columns of `data` colored by cluster.

    Parameters
    ----------
    data : np.ndarray of shape (n_samples, n_features)
        PCA-reduced feature matrix (first two columns are PC1/PC2).
    range_n_clusters : iterable of int
        Candidate cluster counts to evaluate.
    """
    for n_clusters in range_n_clusters:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

        clusterer = KMeans(n_clusters=n_clusters, random_state=22, n_init=10)
        cluster_labels = clusterer.fit_predict(data)
        silhouette_avg = silhouette_score(data, cluster_labels)
        print(f"For n_clusters = {n_clusters}, the average silhouette_score is: {silhouette_avg}")
        sample_silhouette_values = silhouette_samples(data, cluster_labels)

        # Silhouette plot
        y_lower = 10
        ax1.set_xlim([-0.1, 1])
        ax1.set_ylim([0, len(data) + (n_clusters + 1) * 10])
        for i in range(n_clusters):
            ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values.sort()
            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            y_lower = y_upper + 10  # 10-sample gap between cluster bands

        ax1.set_title("Silhouette Plot")
        ax1.set_xlabel("Silhouette Coefficient Values")
        ax1.set_ylabel("Cluster Label")
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
        ax1.set_yticks([])
        ax1.set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd plot: scatter the clustered data itself. The original used the
        # module-level `data_scaled` here, which is a different space (and a
        # different row count for the weekly run) than the `data` argument
        # being clustered; the axis labels say PC1/PC2, so plot `data`.
        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(data[:, 0], data[:, 1], marker='o', s=30, c=colors, alpha=0.7, edgecolor='k')
        ax2.set_title(f"Cluster Visualization (k = {n_clusters})")
        ax2.set_xlabel("Feature Space for PC1")
        ax2.set_ylabel("Feature Space for PC2")

        plt.suptitle(f"Silhouette Analysis for KMeans with n_clusters = {n_clusters}",
                     fontsize=14, fontweight='bold')
        plt.show()
# Run both analyses on the daily PCA-reduced data.
elbow_method(data_pca, max_k=15)
silhouette(data_pca, range_n_clusters=[2, 3, 4, 5, 6])
For n_clusters = 2, the average silhouette_score is: 0.2033246768090897
For n_clusters = 3, the average silhouette_score is: 0.21370316647109985
For n_clusters = 4, the average silhouette_score is: 0.213993002884968
For n_clusters = 5, the average silhouette_score is: 0.18024330461240276
For n_clusters = 6, the average silhouette_score is: 0.1690459620787759
# Prepare PCA-reduced data for the weekly set: standardize (Date column
# excluded) and keep the first 6 principal components.
scaler = StandardScaler()
data_scaled_w = scaler.fit_transform(combined_data_w.drop(columns=['Date']))
pca = PCA(n_components=6)
data_pca_w = pca.fit_transform(data_scaled_w)
# Elbow method (identical re-definition from the weekly notebook cell)
def elbow_method(data, max_k=15):
    """Plot K-means inertia for k = 1..max_k to locate the elbow point."""
    inertias = []
    ks = range(1, max_k + 1)
    for k in ks:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(data)
        inertias.append(kmeans.inertia_)

    plt.plot(ks, inertias, marker='o', color='b')
    plt.xlabel('Number of Clusters (k)')
    plt.ylabel('Inertia')
    plt.title('Elbow Method for Optimal k')
    plt.grid(True, linestyle='dashed', alpha=0.7)
    plt.show()
# Silhouette score (re-definition from the weekly notebook cell)
def silhouette(data, range_n_clusters):
    """For each candidate k: fit K-means on `data`, print the average
    silhouette score, and draw a per-sample silhouette plot beside a 2-D
    scatter of the first two columns of `data` colored by cluster.
    """
    for n_clusters in range_n_clusters:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

        clusterer = KMeans(n_clusters=n_clusters, random_state=22, n_init=10)
        cluster_labels = clusterer.fit_predict(data)
        silhouette_avg = silhouette_score(data, cluster_labels)
        print(f"For n_clusters = {n_clusters}, the average silhouette_score is: {silhouette_avg}")
        sample_silhouette_values = silhouette_samples(data, cluster_labels)

        # Silhouette plot
        y_lower = 10
        ax1.set_xlim([-0.1, 1])
        ax1.set_ylim([0, len(data) + (n_clusters + 1) * 10])
        for i in range(n_clusters):
            ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values.sort()
            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            y_lower = y_upper + 10  # 10-sample gap between cluster bands

        ax1.set_title("Silhouette Plot")
        ax1.set_xlabel("Silhouette Coefficient Values")
        ax1.set_ylabel("Cluster Label")
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
        ax1.set_yticks([])
        ax1.set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd plot: scatter the clustered data itself. The original referenced
        # the module-level *daily* `data_scaled` (455 rows), which cannot be
        # colored by the weekly labels (91 rows); plot `data` instead, which
        # also matches the PC1/PC2 axis labels.
        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(data[:, 0], data[:, 1], marker='o', s=30, c=colors, alpha=0.7, edgecolor='k')
        ax2.set_title(f"Cluster Visualization (k = {n_clusters})")
        ax2.set_xlabel("Feature Space for PC1")
        ax2.set_ylabel("Feature Space for PC2")

        plt.suptitle(f"Silhouette Analysis for KMeans with n_clusters = {n_clusters}",
                     fontsize=14, fontweight='bold')
        plt.show()
# Run both analyses on the weekly PCA-reduced data.
elbow_method(data_pca_w, max_k=15)
silhouette(data_pca_w, range_n_clusters=[2, 3, 4, 5, 6])
For n_clusters = 2, the average silhouette_score is: 0.2467938250235648
For n_clusters = 3, the average silhouette_score is: 0.22325228349628845
For n_clusters = 4, the average silhouette_score is: 0.1855574332777159
For n_clusters = 5, the average silhouette_score is: 0.19874340001005872
For n_clusters = 6, the average silhouette_score is: 0.23421955294037305
DBSCAN
The DBSCAN technique is employed to identify clusters and noise points through the utilization of density connections. In order to determine the search radius and the minimum number of points in the clusters, two parameters are required: the domain radius, denoted as EPS, and the minimum number of samples. Similarly, the Silhouette Score method was employed for the purpose of tuning the parameters and optimizing the model.
A variety of parameter ranges were tested, resulting in the identification of the optimal parameter settings and model. Despite the Silhouette Score of 0.7224 for daily data and 0.6008 for weekly data, the visualization indicates that the clustering is not particularly effective, which may suggest that it is challenging to identify meaningful groupings for the data from the two markets under this dimension.
from sklearn.cluster import DBSCAN
# Optimize DBSCAN with Silhouette Score
def maximize_silhouette(X, eps_range, min_samples_range, i_plot=False):
    """Grid-search DBSCAN over (eps, min_samples) and keep the best by silhouette.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix to cluster.
    eps_range, min_samples_range : iterables
        Candidate values for the DBSCAN neighborhood radius and minimum
        neighborhood size.
    i_plot : bool, optional
        If True, scatter-plot the silhouette score over the searched grid.

    Returns
    -------
    (opt_eps, opt_min_samples, opt_labels)
        All None when no parameter combination produced >= 2 clusters.
    """
    X = np.ascontiguousarray(X)
    print(f"Data shape: {X.shape}")
    params = []
    sil_scores = []
    sil_max = -10  # below the silhouette minimum of -1, so any score wins
    opt_eps = None
    opt_min_samples = None
    opt_labels = None

    for eps in eps_range:
        for min_samples in min_samples_range:
            model = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
            labels = model.labels_
            try:
                # Raises ValueError when DBSCAN finds fewer than 2 clusters.
                sil_score = silhouette_score(X, labels)
            except ValueError:
                continue
            sil_scores.append(sil_score)
            params.append((eps, min_samples))
            if sil_score > sil_max:
                opt_eps = eps
                opt_min_samples = min_samples
                sil_max = sil_score
                opt_labels = labels

    print(f"Optimal EPS: {opt_eps}")
    print(f"Optimal Min Samples: {opt_min_samples}")
    print(f"Maximum Silhouette Score: {sil_max:.4f}")
    # Guard: zip(*params) would raise if every combination failed to score.
    if i_plot and params:
        plt.figure()
        plt.scatter(*zip(*params), c=sil_scores, cmap="viridis")
        plt.colorbar(label='Silhouette Score')
        plt.xlabel('EPS')
        plt.ylabel('Min Samples')
        plt.title('Silhouette Scores for DBSCAN')
        plt.show()
    return opt_eps, opt_min_samples, opt_labels
# DBSCAN on the daily PCA-reduced data.
eps_range = np.arange(6, 12, 0.2)
min_samples_range = range(8, 24)
opt_eps, opt_min_samples, opt_labels = maximize_silhouette(
    data_pca,
    eps_range=eps_range,
    min_samples_range=min_samples_range,
    i_plot=True
)

# Visualization of the optimal labeling on PC1/PC2.
plt.scatter(data_pca[:, 0], data_pca[:, 1], c=opt_labels)
plt.title(f'DBSCAN Clustering (EPS={opt_eps}, Min Samples={opt_min_samples})')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()
Data shape: (455, 8)
Optimal EPS: 9.400000000000002
Optimal Min Samples: 8
Maximum Silhouette Score: 0.7224
# Optimize DBSCAN with Silhouette Score (re-definition from the weekly cell)
def maximize_silhouette(X, eps_range, min_samples_range, i_plot=False):
    """Grid-search DBSCAN over (eps, min_samples) and keep the best by silhouette.

    Returns (opt_eps, opt_min_samples, opt_labels); all None when no
    parameter combination produced >= 2 clusters.
    """
    X = np.ascontiguousarray(X)
    print(f"Data shape: {X.shape}")
    params = []
    sil_scores = []
    sil_max = -10  # below the silhouette minimum of -1, so any score wins
    opt_eps = None
    opt_min_samples = None
    opt_labels = None

    for eps in eps_range:
        for min_samples in min_samples_range:
            model = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
            labels = model.labels_
            try:
                # Raises ValueError when DBSCAN finds fewer than 2 clusters.
                sil_score = silhouette_score(X, labels)
            except ValueError:
                continue
            sil_scores.append(sil_score)
            params.append((eps, min_samples))
            if sil_score > sil_max:
                opt_eps = eps
                opt_min_samples = min_samples
                sil_max = sil_score
                opt_labels = labels

    print(f"Optimal EPS: {opt_eps}")
    print(f"Optimal Min Samples: {opt_min_samples}")
    print(f"Maximum Silhouette Score: {sil_max:.4f}")
    # Guard: zip(*params) would raise if every combination failed to score.
    if i_plot and params:
        plt.figure()
        plt.scatter(*zip(*params), c=sil_scores, cmap="viridis")
        plt.colorbar(label='Silhouette Score')
        plt.xlabel('EPS')
        plt.ylabel('Min Samples')
        plt.title('Silhouette Scores for DBSCAN')
        plt.show()
    return opt_eps, opt_min_samples, opt_labels
# DBSCAN on the weekly PCA-reduced data.
eps_range = np.arange(3, 7, 0.2)
min_samples_range = range(6, 24)
opt_eps, opt_min_samples, opt_labels = maximize_silhouette(
    data_pca_w,
    eps_range=eps_range,
    min_samples_range=min_samples_range,
    i_plot=True
)

# Visualization — plot the WEEKLY data. The original scattered the daily
# `data_pca` (455 rows), which does not match the 91 weekly `opt_labels`.
plt.scatter(data_pca_w[:, 0], data_pca_w[:, 1], c=opt_labels)
plt.title(f'DBSCAN Clustering (EPS={opt_eps}, Min Samples={opt_min_samples})')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()
Data shape: (91, 6)
Optimal EPS: 6.200000000000003
Optimal Min Samples: 6
Maximum Silhouette Score: 0.6008
Hierarchical Clustering
Hierarchical clustering techniques facilitate the generation of tree-structured data hierarchies through recursive processes, obviating the need for input parameters such as the number of clusters. The Ward clustering algorithm was employed, and the silhouette score method was used to evaluate the model performance.
It can be observed that the silhouette scores attain relatively high values for n_clusters=3; nevertheless, the clustering effect remains inadequate.
# Import necessary packages
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
# Perform hierarchical (Ward) clustering on the daily PCA-reduced data.
linkage_matrix = linkage(data_pca, method='ward')

def analyze_clusters(data, linkage_matrix, range_n_clusters):
    """Cut the dendrogram at each candidate cluster count, print the average
    silhouette score, and scatter-plot the labels on the first two columns
    of `data` (PC1/PC2).

    Parameters
    ----------
    data : np.ndarray of shape (n_samples, n_features)
        The matrix the linkage was computed from.
    linkage_matrix : np.ndarray
        Output of scipy's `linkage`.
    range_n_clusters : iterable of int
        Candidate cluster counts.
    """
    for n_clusters in range_n_clusters:
        cluster_labels = fcluster(linkage_matrix, n_clusters, criterion='maxclust')
        silhouette_avg = silhouette_score(data, cluster_labels)
        print(f"For n_clusters = {n_clusters}, the average silhouette_score is: {silhouette_avg}")

        # Visualization
        plt.figure(figsize=(8, 6))
        sns.scatterplot(x=data[:, 0], y=data[:, 1], hue=cluster_labels)
        plt.title(f"Hierarchical Clustering Results (n_clusters = {n_clusters})")
        plt.xlabel("Principal Component 1")
        plt.ylabel("Principal Component 2")
        plt.legend()
        plt.show()

analyze_clusters(data_pca, linkage_matrix, range_n_clusters=range(3, 9))
# Plot dendrogram truncated to the top 3 merge levels.
def plot_dendrogram(linkage_matrix):
    """Draw the hierarchical-clustering dendrogram for `linkage_matrix`,
    truncated to its top 3 levels for readability."""
    plt.figure(figsize=(10, 6))
    dendrogram(linkage_matrix, truncate_mode='level', p=3)
    plt.title("Hierarchical Clustering Dendrogram")
    plt.xlabel("Sample Index")
    plt.ylabel("Distance")
    plt.show()

plot_dendrogram(linkage_matrix)
For n_clusters = 3, the average silhouette_score is: 0.1625572716065391
For n_clusters = 4, the average silhouette_score is: 0.16550367196498164
For n_clusters = 5, the average silhouette_score is: 0.15334009659931447
For n_clusters = 6, the average silhouette_score is: 0.1548185914576616
For n_clusters = 7, the average silhouette_score is: 0.1567283602939523
For n_clusters = 8, the average silhouette_score is: 0.14331080686453992
# Perform hierarchical (Ward) clustering on the weekly PCA-reduced data.
linkage_matrix_w = linkage(data_pca_w, method='ward')

def analyze_clusters(data, linkage_matrix, range_n_clusters):
    """Cut the dendrogram at each candidate cluster count, print the average
    silhouette score, and scatter-plot the labels on the first two columns
    of `data` (PC1/PC2). Identical re-definition from the weekly cell."""
    for n_clusters in range_n_clusters:
        cluster_labels = fcluster(linkage_matrix, n_clusters, criterion='maxclust')
        silhouette_avg = silhouette_score(data, cluster_labels)
        print(f"For n_clusters = {n_clusters}, the average silhouette_score is: {silhouette_avg}")

        # Visualization
        plt.figure(figsize=(8, 6))
        sns.scatterplot(x=data[:, 0], y=data[:, 1], hue=cluster_labels)
        plt.title(f"Hierarchical Clustering Results (n_clusters = {n_clusters})")
        plt.xlabel("Principal Component 1")
        plt.ylabel("Principal Component 2")
        plt.legend()
        plt.show()

analyze_clusters(data_pca_w, linkage_matrix_w, range_n_clusters=range(3, 9))
# Plot dendrogram truncated to the top 3 merge levels (weekly data).
def plot_dendrogram(linkage_matrix):
    """Draw the hierarchical-clustering dendrogram for `linkage_matrix`,
    truncated to its top 3 levels for readability."""
    plt.figure(figsize=(10, 6))
    dendrogram(linkage_matrix, truncate_mode='level', p=3)
    plt.title("Hierarchical Clustering Dendrogram")
    plt.xlabel("Sample Index")
    plt.ylabel("Distance")
    plt.show()

plot_dendrogram(linkage_matrix_w)
For n_clusters = 3, the average silhouette_score is: 0.24042273382355286
For n_clusters = 4, the average silhouette_score is: 0.20099230051589648
For n_clusters = 5, the average silhouette_score is: 0.2116560825886306
For n_clusters = 6, the average silhouette_score is: 0.16203171542422257
For n_clusters = 7, the average silhouette_score is: 0.16757694754407834
For n_clusters = 8, the average silhouette_score is: 0.16776830800478731
Conclusion
By comparing the results of the above three clustering methods, we can find that the combined performance of K-means is relatively good, while the results of DBSCAN show that the data does not have strong density-clustering properties. Overall, it is difficult to recognize clear grouping patterns in the combined market data with any of the methods. This suggests that the data still lacks clear boundary separation after dimensionality reduction.
From a practical application point of view, we can assume that the volatility of the cryptocurrency market and the foreign exchange market in general has no obvious direct correlation. The volatility of the two may be affected by a large number of complex factors, which makes it difficult to find patterns through simple dimensionality reduction and clustering methods, which is also in line with the nature of the financial market data that is difficult to predict. In the subsequent supervised learning analysis, we will try to group the results based on the PCA in this section to introduce more features and observe whether there are localized volatility correlations.