import pandas as pd


class DateModifier:
    def modified(self, series_1: pd.DataFrame, series_2: pd.DataFrame):
        min_datetime_1, max_datetime_1 = series_1.index.min(), series_1.index.max()
        min_datetime_2, max_datetime_2 = series_2.index.min(), series_2.index.max()
        # Keep only the overlapping date range of the two series
        start_datetime = max(min_datetime_1, min_datetime_2)
        end_datetime = min(max_datetime_1, max_datetime_2)
        modified_series_1 = series_1[(start_datetime <= series_1.index) &
                                     (series_1.index <= end_datetime)]
        modified_series_2 = series_2[(start_datetime <= series_2.index) &
                                     (series_2.index <= end_datetime)]
        return self.__time_modification(modified_series_1, modified_series_2)

    @classmethod
    def __time_modification(cls, df_1: pd.DataFrame, df_2: pd.DataFrame):
        common_index = pd.date_range(start=min(df_1.index.min(), df_2.index.min()),
                                     end=max(df_1.index.max(), df_2.index.max()),
                                     freq='h')
        # Reindex and interpolate the series to align them on the common index
        series1_aligned = df_1.reindex(common_index).infer_objects(copy=False).interpolate(method='time')
        series2_aligned = df_2.reindex(common_index).infer_objects(copy=False).interpolate(method='time')
        series1_daily = series1_aligned.ffill().bfill()
        series2_daily = series2_aligned.ffill().bfill()
        return series1_daily, series2_daily
- Dimensionality Reduction: Techniques like Principal Component Analysis (PCA) help reduce the complexity of data by eliminating redundant features. This not only speeds up model training but also helps avoid overfitting.
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA


class PCAFeatureReducer:
    @classmethod
    def reduce_feature(cls, data, n_components=3):
        pca = PCA(n_components=n_components)
        reduced_features = pca.fit_transform(data)
        return reduced_features

    @classmethod
    def plot_pca_and_close(cls, data, reduced_features):
        plt.figure(figsize=(14, 10))
        # Plotting the Close price
        plt.plot(data.index, data['Close'], label='Close Price', color='blue')
        # Plotting the PCA-reduced features
        for i in range(reduced_features.shape[1]):
            plt.plot(data.index, reduced_features[:, i], label=f'PCA Component {i + 1}')
        plt.title('Close Price with PCA-Reduced Features')
        plt.xlabel('Date')
        plt.ylabel('Price')
        plt.legend()
        plt.show()
- Standardization and Normalization: Ensuring that data is on a consistent scale is critical, especially when comparing time series data. Normalization helps maintain uniformity in data scales, making comparisons more meaningful.
import pandas as pd


class DataNormalizer:
    @classmethod
    def normalize(cls, data_series: pd.Series) -> pd.Series:
        # Min-max normalization to the [0, 1] range
        return (data_series - data_series.min()) / (data_series.max() - data_series.min())
- Handling Missing Data: Tools like Imputer and SimpleImputer help fill in missing values using methods such as the mean, median, or nearest neighbors. This step is essential for ensuring complete datasets for accurate analysis.
from sklearn.impute import SimpleImputer


class Impute:
    def __init__(self):
        self.__impute_strategy = 'mean'

    def fill_missing(self, data):
        # Impute missing values with the column mean
        impute = SimpleImputer(strategy=self.__impute_strategy)
        imputed_ta_features = impute.fit_transform(data)
        return imputed_ta_features
- Noise Reduction: Financial data often contain random noise that can skew analysis. Applying Gaussian filters smooths the data, removes unnecessary fluctuations, and improves the accuracy of time series comparisons made with methods like Dynamic Time Warping (DTW).
import pandas as pd
from scipy.ndimage import gaussian_filter1d


class NoiseReducer:
    def __init__(self):
        self.__sigma = 2  # Standard deviation for the Gaussian kernel

    def reduce(self, candle) -> pd.DataFrame:
        # Smooth each OHLC column with a 1-D Gaussian filter
        new_candle = candle
        new_candle['Open'] = gaussian_filter1d(candle['Open'], sigma=self.__sigma)
        new_candle['High'] = gaussian_filter1d(candle['High'], sigma=self.__sigma)
        new_candle['Low'] = gaussian_filter1d(candle['Low'], sigma=self.__sigma)
        new_candle['Close'] = gaussian_filter1d(candle['Close'], sigma=self.__sigma)
        return new_candle
In this study, the preprocessing steps include identifying and removing inactive symbols, standardizing data, managing missing values, and applying Gaussian filters to reduce noise. These processes ensure that the data is clean, consistent, and ready for detailed analysis. We gather all of our data preprocessing procedures in a class with the same name.
class PreProcessor:
    def __init__(self, plot: bool = False, time_windows_day: int = 7):
        self.__windows_hours: int = time_windows_day * 24
        self.__date_modifier = DateModifier()
        self.__impute = Impute()
        self.__noise_reducer = NoiseReducer()
        self.__pca_feature_reducer = PCAFeatureReducer()
        self.__plot = plot
        self.__plotter = StockTrendPlotter(num_columns=3)

    def pre_process(self, df_1, df_2):
        time_modified_series_1, time_modified_series_2 = self.__date_modifier.modified(df_1, df_2)
        noise_reduced_series_1 = self.__noise_reducer.reduce(time_modified_series_1)
        noise_reduced_series_2 = self.__noise_reducer.reduce(time_modified_series_2)
        if self.__plot:
            self.__plot_preprocessing(df_1, time_modified_series_1, noise_reduced_series_1)
            self.__plot_preprocessing(df_2, time_modified_series_2, noise_reduced_series_2)
        return noise_reduced_series_1, noise_reduced_series_2

    @classmethod
    def normalize(cls, series):
        return DataNormalizer.normalize(series)

    def fill_na(self, data):
        return self.__impute.fill_missing(data)

    def reduced_features(self, data_array, pca_component=3):
        return self.__pca_feature_reducer.reduce_feature(data_array, pca_component)

    def __plot_preprocessing(self, series, time_modified_series, noise_reduced_series):
        series = {'Original Series': series,
                  'time_modified_series': time_modified_series,
                  'noise_reduced_series': noise_reduced_series}
        self.__plotter.plot(candle_series=series,
                            column_name='Close',
                            x_label='Time',
                            y_label='Price',
                            title='PreProcessing')
An important step is feature extraction in stock analysis using technical indicators. Dynamic Time Warping (DTW) is widely used for identifying similar patterns in stock price data with minimal error. It helps recognize repetitive price patterns and predict time series trends. However, for detecting stocks with similar trends in order to enhance liquidity in our asset portfolio, price data alone might not provide a comprehensive view of current market conditions. This is because price data, influenced by human behavior and market emotions, can exhibit noisy variations.
In this project, the pattern detection framework can be improved by incorporating financial indicators that filter out noise and better characterize the time series dynamics. While the core of the pattern detection method remains DTW, it now includes various financial indicators.
The approach assumes that distinct time series (or stocks) can be considered substitutes if their behavior aligns in terms of both financial indicators and price trends. Technical indicators widely used in financial market analysis will be incorporated. These indicators, such as moving averages, Bollinger Bands, and the Ichimoku Cloud, help forecast future price movements based on historical data. Here are some key indicators:
- Ichimoku Cloud:
- Conversion Line: Shows short-term trends and helps identify key price turning points.
- Senkou Span A & B: These intersecting lines help detect market trends and determine support and resistance zones.
- Kijun-sen: Represents the average price range over a period and aids in trend detection and trade entry/exit points.
- Moving Averages:
- Simple Moving Average (SMA): Includes fast (short-term) and slow (long-term) moving averages to identify price trends.
- Exponential Moving Average (EMA): Uses a weighted average to detect rapid and intermediate price changes.
- KAMA (Kaufman's Adaptive Moving Average): Adjusts for price volatility and direction, helping to identify quick price changes.
- Parabolic Stop and Reverse (PSAR): Identifies potential price direction reversals through SAR points.
- Volume Weighted Average Price (VWAP): Determines fair price levels based on trading volume.
- Volatility Indicators:
- Keltner Channel: Consists of lower, middle, and upper lines to measure volatility.
- Donchian Channel: Includes middle and upper lines for volatility measurement.
- Bollinger Bands: Features lower and middle lines to assess market volatility.
- Performance Metrics:
- Cumulative Return: Measures the overall return of a stock over a series of periods.
- Adjusted Close: Provides the final price of an asset for technical analysis and various computations.
All of these indicators can be calculated with the ta module.
import pandas as pd
from matplotlib import pyplot as plt
from ta import trend, momentum, volatility, volume, others


class TechnicalFeatures:
    @classmethod
    def add_ta_features(cls, data):
        data['trend_ichimoku_conv'] = trend.ichimoku_a(data['High'], data['Low'])
        data['trend_ema_slow'] = trend.ema_indicator(data['Close'], 50)
        data['momentum_kama'] = momentum.kama(data['Close'])
        data['trend_psar_up'] = trend.psar_up(data['High'], data['Low'], data['Close'])
        data['volume_vwap'] = volume.VolumeWeightedAveragePrice(data['High'], data['Low'], data['Close'], data['Volume']).volume_weighted_average_price()
        data['trend_ichimoku_a'] = trend.ichimoku_a(data['High'], data['Low'])
        data['volatility_kcl'] = volatility.KeltnerChannel(data['High'], data['Low'], data['Close']).keltner_channel_lband()
        data['trend_ichimoku_b'] = trend.ichimoku_b(data['High'], data['Low'])
        data['trend_ichimoku_base'] = trend.ichimoku_base_line(data['High'], data['Low'])
        data['trend_sma_fast'] = trend.sma_indicator(data['Close'], 20)
        data['volatility_dcm'] = volatility.DonchianChannel(data['High'],
                                                            data['Low'],
                                                            data['Close']).donchian_channel_mband()
        data['volatility_bbl'] = volatility.BollingerBands(data['Close']).bollinger_lband()
        data['volatility_bbm'] = volatility.BollingerBands(data['Close']).bollinger_mavg()
        data['volatility_kcc'] = volatility.KeltnerChannel(data['High'],
                                                           data['Low'],
                                                           data['Close']).keltner_channel_mband()
        data['volatility_kch'] = volatility.KeltnerChannel(data['High'],
                                                           data['Low'],
                                                           data['Close']).keltner_channel_hband()
        data['trend_sma_slow'] = trend.sma_indicator(data['Close'], 200)
        data['trend_ema_fast'] = trend.ema_indicator(data['Close'], 20)
        data['volatility_dch'] = volatility.DonchianChannel(data['High'],
                                                            data['Low'],
                                                            data['Close']).donchian_channel_hband()
        data['others_cr'] = others.cumulative_return(data['Close'])
        data['Adj Close'] = data['Close']
        return data

    @classmethod
    def get_price_change(cls, candlesticks) -> pd.DataFrame:
        return candlesticks['Close'].pct_change().dropna()

    @classmethod
    def plot_indicators(cls, data):
        # Adding technical indicators
        data = cls.add_ta_features(data)
        # Plotting
        plt.figure(figsize=(14, 10))
        plt.plot(data.index, data['Close'], label='Close Price', color='blue')
        plt.plot(data.index, data['trend_ema_slow'], label='EMA 50', linestyle='--', color='orange')
        plt.plot(data.index, data['trend_sma_fast'], label='SMA 20', linestyle='--', color='green')
        plt.plot(data.index, data['trend_sma_slow'], label='SMA 200', linestyle='--', color='red')
        plt.plot(data.index, data['momentum_kama'], label='KAMA', linestyle='--', color='purple')
        plt.plot(data.index, data['volume_vwap'], label='VWAP', linestyle='--', color='brown')
        plt.plot(data.index, data['volatility_bbm'], label='Bollinger Bands Middle', linestyle='--', color='magenta')
        plt.plot(data.index, data['volatility_bbl'], label='Bollinger Bands Lower', linestyle='--', color='cyan')
        plt.plot(data.index, data['volatility_kcc'], label='Keltner Channel Middle', linestyle='--', color='gray')
        plt.plot(data.index, data['trend_ichimoku_base'], label='Ichimoku Base Line', linestyle='--', color='black')
        plt.title('Close Price with Technical Indicators')
        plt.xlabel('Date')
        plt.ylabel('Price')
        plt.legend()
        plt.show()
By employing these indicators, analysts can better assess market trends, identify entry and exit points, and make more informed trading decisions. When two stocks show similarities in their financial indicators, it suggests that they may exhibit concurrent movements, which improves the accuracy of the analysis.
Dynamic Time Warping (DTW) is a method used to measure the similarity between two time series that may vary in speed or length. Unlike traditional measures such as Euclidean distance, which may not accurately reflect similarity because of alignment issues, DTW aligns sequences in a time-warped manner to allow for more accurate comparisons.
Initially developed for speech recognition, DTW has found applications in various fields, including finance. For instance, it can compare two time series, A and B, where traditional Euclidean distance might fail to capture their true similarity because of differences in time alignment.
DTW calculates similarity by minimizing the cumulative distance between aligned points in the time series, offering a flexible and effective way to measure the similarity between sequences that may not be perfectly synchronized.
In stock analysis, where price patterns are driven by multiple factors, DTW can identify similar patterns across different stocks, even when they vary in timing or magnitude. By aligning patterns optimally, DTW helps detect stocks with similar trends or movements.
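To make the cumulative-distance idea concrete, here is a minimal illustrative sketch (not part of the project code) that fills the classic DTW cost matrix for two 1-D sequences; the bottom-right cell is the DTW distance.
import numpy as np

def naive_dtw(a: np.ndarray, b: np.ndarray) -> float:
    # Classic O(len(a) * len(b)) dynamic-programming DTW for 1-D sequences
    n, m = len(a), len(b)
    cost = np.full((n + 1, m + 1), np.inf)
    cost[0, 0] = 0.0
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            d = abs(a[i - 1] - b[j - 1])  # local point-to-point distance
            # Extend the cheapest of the three allowed alignment moves
            cost[i, j] = d + min(cost[i - 1, j], cost[i, j - 1], cost[i - 1, j - 1])
    return cost[n, m]

# Two series with the same shape but shifted in time still get a small distance
print(naive_dtw(np.array([0, 1, 2, 1, 0]), np.array([0, 0, 1, 2, 1, 0])))
This quadratic formulation is exactly what FastDTW approximates more efficiently, as used next.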
By using the FastDTW library, we can calculate the dynamic time warping distance between two different time series.
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean

from src.Domain.SimilarityDetector.PreProcessing import PreProcessor


class DTWCalculator:
    @classmethod
    def calculate(cls, time_series_1, time_series_2):
        # Normalize both series before measuring the DTW distance
        ts1_normalized = PreProcessor.normalize(time_series_1)
        ts2_normalized = PreProcessor.normalize(time_series_2)
        distance, _ = fastdtw(ts1_normalized.reshape(-1, 1), ts2_normalized.reshape(-1, 1), dist=euclidean)
        return distance
Principal Component Analysis (PCA) is a statistical technique used to reduce the dimensionality of data, making it valuable for navigating high-dimensional spaces such as technical indicators in finance. PCA identifies "principal components" that capture the most variance in the data, allowing for a reduced-dimensional representation.
PCA involves several key steps:
- Standardization: Data is standardized to have a mean of zero and a variance of one.
- Covariance Matrix Calculation: A covariance matrix is computed to capture the relationships between variables.
- Eigenvalue and Eigenvector Calculation: Eigenvalues and eigenvectors are derived from the covariance matrix, representing the principal components.
- Component Selection: The principal components explaining the most variance are selected.
- Data Transformation: Data is projected into the space defined by the principal components.
- Result Interpretation: Analysis of the principal components helps identify patterns and significant features (see the sketch after this list).
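The sketch below is illustrative only and not part of the project code; it walks through these steps with plain NumPy: standardize, compute the covariance matrix, take its eigendecomposition, and project onto the top components.
import numpy as np

def pca_by_hand(X: np.ndarray, n_components: int = 3) -> np.ndarray:
    # 1. Standardization: zero mean, unit variance per feature
    X_std = (X - X.mean(axis=0)) / X.std(axis=0)
    # 2. Covariance matrix of the standardized features
    cov = np.cov(X_std, rowvar=False)
    # 3. Eigenvalues and eigenvectors of the symmetric covariance matrix
    eigenvalues, eigenvectors = np.linalg.eigh(cov)
    # 4. Component selection: keep the eigenvectors with the largest eigenvalues
    order = np.argsort(eigenvalues)[::-1][:n_components]
    components = eigenvectors[:, order]
    # 5. Data transformation: project onto the selected components
    return X_std @ components

# Example: 500 observations of 20 correlated "indicator" columns reduced to 3
rng = np.random.default_rng(0)
X = rng.normal(size=(500, 20)) @ rng.normal(size=(20, 20))
print(pca_by_hand(X, n_components=3).shape)  # (500, 3)
In practice the project uses scikit-learn's PCA (shown below), which performs the same steps internally.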
Before applying DTW, it is essential to reduce the dimensionality of the technical indicators with PCA because of their high interdependence. This process eliminates redundant data and trends, improving the effectiveness of pattern recognition.
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA


class PCAFeatureReducer:
    @classmethod
    def reduce_feature(cls, data, n_components=3):
        pca = PCA(n_components=n_components)
        reduced_features = pca.fit_transform(data)
        return reduced_features

    @classmethod
    def plot_pca_and_close(cls, data, reduced_features):
        plt.figure(figsize=(14, 10))
        # Plotting the Close price
        plt.plot(data.index, data['Close'], label='Close Price', color='blue')
        # Plotting the PCA-reduced features
        for i in range(reduced_features.shape[1]):
            plt.plot(data.index, reduced_features[:, i], label=f'PCA Component {i + 1}')
        plt.title('Close Price with PCA-Reduced Features')
        plt.xlabel('Date')
        plt.ylabel('Price')
        plt.legend()
        plt.show()
In summary, combining DTW and PCA enables a more accurate and efficient analysis of stock price patterns by aligning sequences and reducing dimensionality, thereby facilitating better detection of similar trends and movements.
One way to calculate the distance between two different stocks and time series, based on what [?] discussed in his article, is to introduce a composite index: define a weighting coefficient and then calculate the composite distance of the two time series. So, we can define it as below:
# w is a weighting coefficient
D_composite = D_time_series + w * D_technical_distance
The output of the previous methods and steps is a matrix of combined distances, derived from the PCA-reduced technical-indicator distances and the distances of the time series themselves. To use these results and select stocks with similar trends, the data is sorted by increasing distance, because in the DTW method a smaller distance indicates greater similarity between two time series.
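As an illustration only (hypothetical variable names, and assuming the DTWCalculator, TechnicalFeatures, and PreProcessor classes above), the composite distance for each candidate stock could be computed and the candidates ranked by increasing distance roughly like this; comparing only the first principal component is a deliberate simplification here.
import numpy as np

def composite_distance(target_df, candidate_df, w: float = 0.5) -> float:
    # Distance between the raw close-price series
    d_price = DTWCalculator.calculate(target_df['Close'].to_numpy(),
                                      candidate_df['Close'].to_numpy())
    # Distance between PCA-reduced technical-indicator features
    pre_processor = PreProcessor()
    target_features = pre_processor.fill_na(TechnicalFeatures.add_ta_features(target_df.copy()))
    candidate_features = pre_processor.fill_na(TechnicalFeatures.add_ta_features(candidate_df.copy()))
    target_reduced = pre_processor.reduced_features(target_features)[:, 0]        # first component only
    candidate_reduced = pre_processor.reduced_features(candidate_features)[:, 0]  # first component only
    d_technical = DTWCalculator.calculate(target_reduced, candidate_reduced)
    return d_price + w * d_technical

# candidates is assumed to be a dict of {symbol: OHLCV DataFrame}
# ranked = sorted(candidates, key=lambda s: composite_distance(target_df, candidates[s]))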
Identifying stocks whose price movements follow similar trends is crucial for enhancing liquidity and flexibility in a liquid portfolio, particularly in inefficient markets, and for investment freedom. While the application of DTW to identifying stock price patterns, as demonstrated in this study, can be effective, it also has several weaknesses.
The approach presented in this study is a starting point for combining DTW and PCA into a composite index for pattern recognition in stock price data. Therefore, there are limitations and areas for improvement that should be considered. Despite these limitations, there are ways to improve the approach, many of which have been addressed in this study.
- Parameter Optimization Through Cross-Validation: The performance of the DTW and PCA techniques depends heavily on parameter selection, such as the weighting coefficient in the distance calculation, the time intervals considered, and the variables chosen for PCA. Currently there is no systematic method for parameter tuning, but adding techniques like grid search or evolutionary optimization algorithms could help find an optimal set of parameters that maximizes the accuracy and reliability of the pattern recognition and prediction method (a sketch of such a search follows this list).
- Assumption of Repeating Patterns: One of the key assumptions of the DTW method is that patterns in a time series will reappear in the future. This assumption may not always hold in the highly volatile and multifaceted stock market, which is influenced by numerous, often unprecedented, variables. The assumption that past price patterns will repeat in the future is a common simplification underlying technical analysis in financial markets. However, it should be noted that market dynamics evolve, and historical patterns do not necessarily repeat or lead to accurate predictions.
- Univariate Analysis: The current implementation of DTW focuses primarily on stock prices and ignores other influential factors such as trading volume or volatility, which could provide important insights into market behavior and improve predictive capability. Including these additional aspects can offer a more comprehensive understanding of stock price movements. Engineering features and expanding the feature set with more diverse predictive indicators or alternative data sources could also improve the accuracy of the pattern recognition method.
- Computational Load: Especially with larger datasets, DTW can impose a high computational burden because of its quadratic time complexity, leading to inefficiencies. When applied to price data, the extensive range of stock prices and time intervals may result in long processing times and significant resource use. One way to address this is to use optimization methods such as FastDTW, which reduces the time complexity by approximating DTW distances and can be crucial for managing computational needs.
- Sensitivity to Noise: DTW's sensitivity to noise in the data can affect pattern recognition accuracy, since short-term fluctuations may disproportionately influence the DTW distance calculation. Although technical variables are used in this study to identify price movement trends, this issue may still introduce errors when comparing the price trends of two stock time series. Designing mechanisms to identify and adjust for anomalous prices or extreme events could improve the accuracy and robustness of the pattern recognition and prediction method. One approach to smoothing anomalies is using moving averages, which are considered as an indicator in this study. However, for raw time series distance calculation, implementing noise reduction methods can help isolate important patterns by reducing the influence of short-term data anomalies.
- Integration with Other Machine Learning Models: Combining DTW with other machine learning models can enhance predictive and pattern recognition capabilities by pairing precise pattern identification with predictive validation.
- Multi-Asset Pattern Recognition: Expanding the scope to include diverse assets or even different markets could provide richer analysis. By analyzing patterns across assets or related sectors, broader market trends or recurring patterns not limited to the primary analyzed asset might be discovered. For example, similar trends can be observed between the behavior of sectoral funds and physical gold in the sample market, which could expand liquidity.
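Picking up the first item in the list above, a grid search over the weighting coefficient could look like the following sketch; the labeled pairs and helper names (including the composite_distance helper sketched earlier) are assumptions, not part of the project code. Each candidate value of w is scored by how well the resulting composite distance separates known similar pairs from dissimilar ones.
import numpy as np

def score_weight(w: float, similar_pairs, dissimilar_pairs) -> float:
    # A good w makes similar pairs score lower than dissimilar ones;
    # here we score w by the gap between the two mean composite distances.
    d_sim = np.mean([composite_distance(a, b, w=w) for a, b in similar_pairs])
    d_dis = np.mean([composite_distance(a, b, w=w) for a, b in dissimilar_pairs])
    return d_dis - d_sim

# similar_pairs / dissimilar_pairs are assumed lists of (DataFrame, DataFrame) tuples
# best_w = max(np.linspace(0.0, 2.0, 21), key=lambda w: score_weight(w, similar_pairs, dissimilar_pairs))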
Despite the strengths of the DTW + PCA method, finding a suitable threshold for detecting similarity between two stocks and their trends requires studying both similar and dissimilar samples. To address this, the output of the DTW + PCA method can be used to train a machine learning model on numerous similar and dissimilar samples, producing a model capable of identifying co-trending and non-co-trending stocks.
In this study, since the problem involves identifying similar and dissimilar stocks relative to the target stock, it is framed as a classification problem. Based on the problem type, the amount of available data, and the other studies discussed further on, the Random Forest classification model was chosen for this research.
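The classifier code is not shown at this point in the source, so the following is only a minimal sketch of the stated idea: a scikit-learn RandomForestClassifier trained on distance-based features computed from labeled similar/dissimilar pairs. The feature construction and the placeholder data here are assumptions for illustration.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# X: one row per stock pair, e.g. [price DTW distance, technical DTW distance, composite distance]
# y: 1 if the pair is labeled as co-trending, 0 otherwise (labels would come from the synthetic data)
X = np.random.rand(200, 3)          # placeholder features for illustration
y = (X[:, 2] < 0.5).astype(int)     # placeholder labels for illustration

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)
print('Hold-out accuracy:', model.score(X_test, y_test))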
Synthetic data generation refers to the process of creating data produced by computer algorithms that resemble real data, given the specific conditions of the research. This approach is particularly useful when real data are limited, sensitive, or expensive. Synthetic data can help machine learning models learn better and perform more effectively.
In this study, the use of synthetic data is essential because of the lack of sufficient real data for specific stocks and the limited number of stocks in the stock markets. Moreover, the business definition of co-movement is another important aspect. For example, one client might define a stock with a similar profitability trend and a cumulative profit difference ranging from -5% to +5% as a co-moving stock, while others might impose different constraints.
Methods of Synthetic Data Generation
1 – Generative Adversarial Networks (GANs)
GANs consist of two neural networks: a generator and a discriminator. The two networks are trained simultaneously: the generator creates synthetic data intended to be indistinguishable from real data, while the discriminator tries to differentiate between real and synthetic data. The generator produces random data samples, and the discriminator attempts to determine whether these samples are real or artificial. Over repeated iterations, the generator learns to produce high-quality synthetic data.
2 — Variational Autoencoders (VAEs)
VAEs are a type of autoencoder designed to generate synthetic data using probabilistic distributions. These models use a neural network to compress data into a latent space and then reconstruct the data from that latent space. Input data are first mapped to the latent space, and new data are then generated from it. This method helps create synthetic data whose distribution is similar to that of the real data.
3 — Data Augmentation
Data augmentation is a technique in which random modifications are applied to existing data to create a larger and more diverse training dataset. These modifications can include rotation, scaling, cropping, adding noise, and so on. By applying various transformations to real data, new samples are generated, which helps the model respond better to changes and fluctuations in the data.
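For time series, a common form of augmentation is perturbing an existing price series; the short sketch below is illustrative only and adds Gaussian noise and a random scale factor to a close-price series to create new variants.
import numpy as np
import pandas as pd

def augment_close_series(close: pd.Series, noise_std: float = 0.01, rng=None) -> pd.Series:
    rng = rng or np.random.default_rng()
    scale = rng.uniform(0.95, 1.05)                                # random re-scaling of the whole series
    noise = rng.normal(0.0, noise_std * close.std(), len(close))   # additive Gaussian noise
    return close * scale + noise

# Example: create five noisy variants of the same series
# variants = [augment_close_series(df['Close']) for _ in range(5)]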
4 – Simulations
Simulations generate synthetic data using mathematical and physical models that mimic real-world processes. This method is particularly common in engineering, physics, and economics. By using simulation models, synthetic data with characteristics similar to real data can be produced. These models are usually designed according to scientific principles and laws.
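In finance, a standard simulation of this kind is a geometric Brownian motion price path; the sketch below is not part of the project code and generates synthetic prices from assumed drift and volatility parameters.
import numpy as np

def simulate_gbm(s0: float, mu: float, sigma: float, n_steps: int, dt: float = 1 / 252) -> np.ndarray:
    # Geometric Brownian motion: S_{t+1} = S_t * exp((mu - sigma^2 / 2) * dt + sigma * sqrt(dt) * Z)
    z = np.random.standard_normal(n_steps)
    log_returns = (mu - 0.5 * sigma ** 2) * dt + sigma * np.sqrt(dt) * z
    return s0 * np.exp(np.cumsum(log_returns))

prices = simulate_gbm(s0=100.0, mu=0.08, sigma=0.25, n_steps=252)  # one simulated year of daily prices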
Synthetic Data Generation in This Study
In the present research, because of the limited amount of relevant data on co-moving stocks and the limited number of symbols, training a machine learning model is considerably challenging. Moreover, a machine learning model has no pre-training assessment of the similarity or dissimilarity of the input data. Therefore, synthetic data must be generated randomly, with some controls.
The benefits of using synthetic data in this research include:
- Increasing Data Volume: Generating synthetic data increases the dataset size, which leads to improved accuracy and performance of machine learning models.
- Cost and Time Reduction: Collecting and labeling real data can be time-consuming and costly. Synthetic data generation provides a cost-effective and rapid way to create large datasets.
- Privacy and Security: Real data may contain sensitive and personal information. Synthetic data can help preserve privacy and data security.
- Improving Model Generalization: Diverse synthetic data help the model adapt better to variations and changes in the data, improving its generalization.
However, there are considerations when generating synthetic data:
- Data Quality: One of the main challenges is ensuring the quality and accuracy of the generated data. Synthetic data must closely resemble real data for the model to learn effectively.
- Avoiding Overfitting: Synthetic data might lead to overfitting, especially if the generated data lack sufficient diversity. Care must be taken to ensure that synthetic data help the model learn new patterns effectively.
- Complexity of Generative Models: Generative models like GANs and VAEs are often complex and require significant computation and fine-tuning, which can increase the time and resources needed for synthetic data generation.
Initially, the similarity conditions of the two stocks should be defined in terms of price candles. According to the studies of (Cristian Velasquez) and (Mengxia et al.) and other sources, the following conditions can be considered for co-movement:
- The stock trends, including rises, falls, and direction changes, should be as similar as possible.
- The price changes within a given time range for both stocks should not exceed an error margin.
- The main trend should be evaluated on the closing price data.
- Maximum and minimum prices and volume data should be random yet consistent with the initial data.
- Stocks can have several fluctuations in different periods.
- Stocks can have Gaussian noise.
Based on these conditions, two functions are used: one for generating synthetic data similar to the main data and trend, and another for generating synthetic data with a trend different from the main stock. Additional functions and conditions are created to satisfy the requirements for these data. After this, machine learning can be applied to the synthetic data before analyzing the similarity of the stocks.
To generate a similar trend, we define a simple class as below and ensure that the generated time series meets the specifications introduced for our learning purpose.
import random
from multiprocessing import Pool

import numpy as np
import pandas as pd


class SimilarCandleGeneratorNew:
    def __init__(self, base_df):
        self.__base_df: pd.DataFrame = base_df
        self.__calculate_indicators(self.__base_df)
        self.__base_df['Date'] = self.__base_df.index.date
        self.__counter = 0

    @staticmethod
    def generate_multiprocessing_candlestick(args):
        self, price_range, volatility_range, fluctuation_range, volume_range, return_window, main_candle = args
        return self.generate_sample(price_range, volatility_range, fluctuation_range, volume_range, return_window, main_candle)

    @classmethod
    def __calculate_indicators(cls, candle_df):
        candle_df['EMA_12'] = candle_df['Close'].ewm(span=12, adjust=False).mean().bfill()
        candle_df['EMA_26'] = candle_df['Close'].ewm(span=26, adjust=False).mean().bfill()
        candle_df['SMA_12'] = candle_df['Close'].rolling(window=12).mean().bfill()
        candle_df['SMA_26'] = candle_df['Close'].rolling(window=26).mean().bfill()
        candle_df['Returns'] = candle_df['Close'].ffill().pct_change()
        candle_df['Cumulative_Returns'] = (1 + candle_df['Returns']).cumprod() - 1

    def __generate_hourly_candles(self, open_price, high_constraint, low_constraint, volume_range, date, main_candle):
        hourly_candles = []
        previous_close = open_price
        for hour in range(24):
            volume = random.randint(*volume_range)
            # Keep the trend and return similar to the main candle
            main_candle_return = (main_candle['Close'] - main_candle['Open']) / main_candle['Open']
            price_change = main_candle_return * previous_close + np.random.uniform(-0.001, 0.001) * previous_close
            fluctuation = np.random.uniform(0.001, 0.01) * previous_close
            open_price = previous_close
            low_price = max(low_constraint, open_price - fluctuation)
            high_price = min(high_constraint, open_price + fluctuation)
            close_price = open_price + price_change
            # Ensure the close price stays within bounds
            close_price = max(low_price, min(close_price, high_price))
            generated_candle = {
                'OpeningTime': f'{date} {hour:02d}:00:00',
                'Open': open_price,
                'High': high_price,
                'Low': low_price,
                'Close': close_price,
                'Volume': volume,
            }
            hourly_candles.append(generated_candle)
            previous_close = close_price
        return hourly_candles

    def __generate_daily_candles(self, previous_close, volume_range, main_candle):
        all_candlesticks = []
        for date, group in self.__base_df.groupby('Date'):
            daily_open = previous_close
            daily_high_constraint = daily_open * 1.05
            daily_low_constraint = daily_open * 0.95
            daily_candles = self.__generate_hourly_candles(daily_open, daily_high_constraint, daily_low_constraint, volume_range, date, main_candle)
            all_candlesticks.extend(daily_candles)
            previous_close = np.mean([candle['Close'] for candle in daily_candles])
        return all_candlesticks

    def __generate_sample(self, price_range, volatility_range=(0.001, 0.99), fluctuation_range=(0.001, 0.99),
                          volume_range=(1000, 10000), return_window=0.05, main_candle=None):
        initial_close = self.__base_df.iloc[0]['Close']
        all_candlesticks = self.__generate_daily_candles(initial_close, volume_range, main_candle)
        new_df = pd.DataFrame(all_candlesticks)
        new_df['OpeningTime'] = pd.to_datetime(new_df['OpeningTime'])
        new_df.set_index('OpeningTime', inplace=True)
        return new_df

    def __ensure_similarity(self, sample_df, main_candle):
        base_returns = self.__base_df['Returns'].dropna()
        # The generated frame has no 'Returns' column yet, so compute returns from Close
        sample_returns = sample_df['Close'].pct_change().dropna()
        # Align the indices of the base and sample returns
        aligned_base_returns, aligned_sample_returns = base_returns.align(sample_returns, join='inner')
        # Check whether the sample returns are similar to the main candle return
        main_candle_return = (main_candle['Close'] - main_candle['Open']) / main_candle['Open']
        sample_returns_mean = sample_returns.mean()
        return np.isclose(sample_returns_mean, main_candle_return, atol=0.05)

    def generate_sample(self, price_range, volatility_range=(0.1, 0.5), fluctuation_range=(0.1, 0.5),
                        volume_range=(1000, 10000), return_window=0.05, main_candle=None):
        print(f'Generating similar sample number {self.__counter}')
        try:
            sample_df = self.__generate_sample(price_range,
                                               volatility_range=volatility_range,
                                               fluctuation_range=fluctuation_range,
                                               volume_range=volume_range,
                                               return_window=return_window,
                                               main_candle=main_candle)
            if self.__ensure_similarity(sample_df, main_candle):
                return sample_df
            else:
                return None
        except Exception as e:
            print(e)

    def generate_samples(self, price_range,
                         sampling_count: int = 100,
                         volatility_range=(0.001, 0.5),
                         fluctuation_range=(0.001, 0.5),
                         volume_range=(1000, 10000), return_window=0.05, main_candle=None):
        self.__counter = 0
        print('Similar sample generator starting to generate samples')
        with Pool() as pool:
            tasks = [(self, price_range, volatility_range, fluctuation_range, volume_range, return_window, main_candle)
                     for _ in range(sampling_count)]
            generated_candles = pool.map(SimilarCandleGeneratorNew.generate_multiprocessing_candlestick, tasks)
        return [candle for candle in generated_candles if candle is not None]
Below are some examples of the generated similar datasets: