Feature Extraction
Feature Extraction
NLP에서 특징 추출은 원시 텍스트 데이터를 기계 학습 모델에서 사용할 수 있는 수치 표현으로 변환하는 프로세스를 말합니다.
특징 추출 파이프라인이 반환하는 텐서의 모양은 입력 텍스트의 토큰 수와 BERT 모델의 은닉층 크기(hidden size)에 따라 달라집니다.
"bert-base-uncased" 모델의 경우 각 토큰은 768차원 벡터로 표현되므로 결과의 모양은 (1, number_of_tokens, 768)이 되며, 여기서 1은 배치 크기가 1임(단일 문장을 처리함)을 나타냅니다.
from transformers import pipeline
# Build a feature-extraction pipeline on top of the BERT base (uncased) checkpoint.
extractor = pipeline(task="feature-extraction", model="google-bert/bert-base-uncased")
# return_tensors=True requests a tensor result instead of nested Python lists,
# so the shape can be inspected directly.
result = extractor(
    "This is a simple test.",
    return_tensors=True,
)
result.shape
> torch.Size([1, 8, 768])
from transformers import pipeline
checkpoint = "facebook/bart-base"
# PyTorch-backed feature-extraction pipeline for the BART base checkpoint.
feature_extractor = pipeline(task="feature-extraction", model=checkpoint, framework="pt")
text = "Transformers is an awesome library!"
# Take the first (only) entry of the output, convert it to NumPy, and average
# over axis 0 so the per-token vectors collapse into a single 1-D array.
# NOTE(review): return_tensors is documented as a bool; "pt" presumably works
# only because it is truthy - confirm against the pipeline documentation.
result = (
    feature_extractor(text, return_tensors="pt")[0]
    .numpy()
    .mean(axis=0)
)
result
array([ 2.65979469e-01, -6.56846821e-01, 1.25627622e-01, 1.36601961e+00,
-7.59062350e-01, 7.56494999e-01, -1.05783379e+00, -1.87183488e-02,
-6.71836197e-01, -1.96252692e+00, 2.81317592e-01, 1.35964262e+00,
-7.21477158e-03, 9.12114024e-01, 3.43531281e-01, -1.35358238e+00,
-7.05923915e-01, 1.40108848e+00, 9.22238588e-01, -1.89351216e-01,
2.40267381e-01, -2.19243467e-01, 9.30557787e-01, 7.81986415e-01,
-1.01491988e+00, -5.85219324e-01, 6.38441265e-01, -4.86978149e+00,
-1.68945134e-01, -3.85040236e+00, -9.25553560e-01, -1.71881378e-01,
5.59025332e-02, -6.87882125e-01, -1.03691256e+00, -3.37629229e-01,
4.45500523e-01, 2.23247707e-01, 5.60621440e-01, 4.59313184e-01,
2.38988829e+00, 6.36417508e-01, -8.32767725e-01, 3.60407889e-01,
-2.03047842e-01, 9.03682828e-01, -1.65837502e+00, 1.19581902e+00,
-2.48081341e-01, 8.12374890e-01, -4.88460273e-01, 2.99310595e-01,
5.61737835e-01, -3.31205308e-01, -1.28257060e+00, -4.96809697e-03,
7.20308483e-01, -4.77443188e-02, 4.61459041e-01, 1.31555617e+00,
-2.52127767e-01, -2.13813365e-01, -8.04339498e-02, -7.61193752e-01,
-5.84884942e-01, -7.94288933e-01, -7.32236505e-01, 1.45702612e+00,
7.83099055e-01, -4.38804209e-01, 3.54020238e-01, 7.75779635e-02,
1.90865564e+00, -1.15279332e-01, -2.89797974e+00, 4.35574591e-01,
1.95469415e+00, -1.57004988e+00, -6.98929250e-01, 1.97130308e-01,
2.16544896e-01, 1.79927900e-01, -4.03966725e-01, -5.33977710e-02,
4.32771295e-02, -6.61527812e-01, -7.12189525e-02, -4.96350408e-01,
3.23695511e-01, 3.63479644e-01, -7.13343740e-01, 9.46864963e-01,
-4.25351888e-01, -1.62067080e+00, 4.80425507e-01, 6.41202092e-01,
2.59259790e-01, 5.41801155e-01, -3.12652916e-01, -7.57046163e-01,
1.49013770e+00, 2.58979249e+00, -1.38797617e+00, 6.22176707e-01,
-4.93953610e-03, 1.16229892e+00, 1.27965415e+00, -7.59564459e-01,
4.44990993e-01, -1.19959974e+00, -2.21594334e-01, -1.01445842e+00,
-1.89106047e-01, -3.31210196e-01, 8.83097172e-01, -1.40546516e-01,
3.58635008e-01, -4.30313945e-01, 1.20075715e+00, -3.19993407e-01,
1.29259288e+00, 1.60403013e+00, -1.70512187e+00, 4.31367129e-01,
-4.54189837e-01, 1.58637106e-01, -7.68502951e-01, 5.44382691e-01,
-7.79063642e-01, 7.52042174e-01, 3.63594383e-01, 1.50957608e+00,
4.73330081e-01, 5.57976484e-01, 3.91978323e-01, -1.21539712e+00,
-1.84798136e-01, 8.70272458e-01, -1.24984539e+00, -3.81815940e-01,
-2.21528202e-01, -2.32662141e-01, 1.01794183e+00, 4.10634369e-01,
-2.76572406e-01, -7.22168162e-02, 7.32802749e-02, -4.83183920e-01,
2.79749595e-02, -1.90762311e-01, -2.02779293e+00, -1.51625490e-02,
2.34252289e-01, 1.20170951e+00, 5.16299963e-01, -1.41489655e-01,
-2.46584147e-01, 1.91934600e-01, -2.89843321e-01, 7.42309749e-01,
-2.88151741e-01, -8.30338299e-01, 1.08718443e+00, 7.55699515e-01,
1.44116759e-01, -1.84054241e-01, -9.06795204e-01, 2.49311715e-01,
-3.42900723e-01, 8.78563583e-01, 3.04132570e-02, 1.34575880e+00,
-1.85381448e+00, 6.08434156e-02, -1.42206645e+00, 6.67507231e-01,
1.34545970e+00, 4.45354342e-01, 2.74102747e-01, -1.42620504e+00,
1.66471791e+00, 4.83929425e-01, -3.44172597e-01, 7.31653988e-01,
-1.34243101e-01, 3.96722883e-01, 6.22375488e-01, 3.31292927e-01,
-3.20087582e-01, -1.07533097e+00, 9.76715624e-01, -3.52599472e-01,
-1.31508529e-01, -1.68829501e-01, -1.38619637e+00, 8.90181839e-01,
-5.07539250e-02, 3.07513736e-02, -3.93885255e-01, -1.71657515e+00,
1.08871162e+00, 4.53436106e-01, 1.21618068e+00, 5.08168757e-01,
-6.95172369e-01, -9.72706795e-01, -6.98203564e-01, -3.77047509e-02,
5.14743686e-01, 7.26660311e-01, 1.42454967e-01, 1.70566046e+00,
2.11760625e-01, 7.90104866e-01, -1.11895156e+00, 9.30911675e-02,
-8.53542924e-01, -3.47614557e-01, -8.34994614e-01, -2.87239719e-02,
1.74926448e+00, -1.14972591e+00, 7.42217958e-01, -5.98051906e-01,
4.17294323e-01, 9.28575099e-02, -1.65214360e+00, 2.74253279e-01,
-6.29156351e-01, -6.69885874e-01, -6.18473053e-01, 4.31621671e-01,
-5.20275354e-01, 1.36028194e+00, 1.14785850e+00, -7.18702853e-01,
7.51556270e-03, 1.28168547e+00, 4.58477229e-01, -6.67250633e-01,
-8.71611178e-01, 1.54485416e+00, 9.50916409e-01, 1.56043792e+00,
4.03211296e-01, 1.39242363e+00, -1.03513181e-01, 5.22949100e-01,
6.63481355e-01, -2.78556813e-02, -7.29202271e-01, -3.53717595e-01,
-1.51166081e+00, -1.01856375e+00, 6.35557413e-01, -1.74585164e+00,
1.15104437e+00, 1.59911370e+00, 1.51997781e+00, 3.44798148e-01,
-1.03017783e+00, -3.45427930e-01, -3.04957658e-01, -1.52942911e-01,
-5.44561386e-01, 2.72866637e-01, -4.63098109e-01, 1.88960946e+00,
-6.39785528e-01, 4.92546350e-01, 3.46445173e-01, 1.83566287e-01,
3.37440491e-01, 8.80383372e-01, -1.57478404e+00, 2.41358012e-01,
3.42918366e-01, -1.77611008e-01, -1.44021642e+00, -4.05111313e-01,
9.36335549e-02, 1.14355230e+00, 5.38416505e-01, -1.01677418e+00,
-6.91158831e-01, 1.24062963e-01, 2.08523095e-01, 9.39111337e-02,
9.54437554e-01, -9.73225534e-02, 1.23226786e+00, 1.23787582e+00,
-3.61190140e-01, 9.04121161e-01, 1.90456226e-01, 3.66956741e-01,
-8.37930024e-01, 5.74374676e-01, -1.30378753e-01, 1.15551674e+00,
7.32960701e-01, -6.51617289e-01, 1.14175498e+00, 4.53374743e-01,
-1.02572456e-01, 1.62613416e+00, 2.79911518e-01, -9.57020462e-01,
-1.28427362e+00, -3.79492283e-01, 1.21321309e+00, -6.52288198e-01,
-7.16957331e-01, 8.21545362e-01, -2.78386503e-01, 1.13116992e+00,
1.09877765e+00, -9.30897295e-01, 3.44983310e-01, 8.14800978e-01,
-4.39907372e-01, 3.38642240e-01, -4.09136683e-01, 1.71418175e-01,
5.85774332e-02, 6.26304924e-01, 1.14245892e+00, -7.30417669e-02,
4.70405728e-01, 2.00712815e-01, -1.10264778e+00, 6.69935524e-01,
1.26470280e+00, 9.37951952e-02, -9.04536903e-01, 7.71072134e-02,
6.09872103e-01, 4.51007843e-01, 9.30153728e-01, -3.91749442e-01,
6.32270098e-01, 2.91940570e-01, 6.22446001e-01, 5.41503489e-01,
4.00291502e-01, 5.99478722e-01, 8.12314451e-02, 8.11395943e-02,
4.41665947e-01, 1.10141015e+00, -9.73566175e-01, -1.45525181e+00,
5.64943314e-01, -2.38152429e-01, 1.41102523e-01, -7.86105156e-01,
8.86056185e-01, -7.10839093e-01, 3.27000290e-01, 5.97459912e-01,
-1.95455417e-01, -5.74278355e-01, 1.28140464e-01, 8.05716038e-01,
-4.99953538e-01, 3.17015976e-01, 3.11146140e-01, 2.31078458e+00,
-6.16076440e-02, -4.31919515e-01, -1.45730332e-01, 6.16032660e-01,
-9.90341902e-01, -8.43507424e-02, -8.87243569e-01, -3.43032390e-01,
1.29097724e+00, -6.21537507e-01, 5.30034304e-01, 2.00960255e+00,
-3.07885379e-01, -3.07402015e-01, -2.99035162e-01, 1.58422029e+00,
6.66770160e-01, 9.46137309e-01, 1.00991011e+00, -4.43267673e-01,
-1.53554392e+00, -3.11964750e-01, -9.86443758e-02, 2.98534557e-02,
-2.61718333e-01, -2.58908939e+00, -4.89793241e-01, 2.49610424e+00,
5.08717000e-01, 6.50168598e-01, 3.08177888e-01, 4.27572727e-01,
2.46583670e-01, -3.61945689e-01, 3.36508095e-01, 1.83386886e+00,
-3.30805369e-02, 4.69550401e-01, -2.56731361e-01, -5.89862108e-01,
1.82226884e+00, 4.07911837e-01, -5.95187187e-01, 8.44984949e-02,
2.93200195e-01, 2.97689795e+00, 1.81511497e+00, 1.09478426e+00,
6.32766724e-01, -3.60258728e-01, -1.17961836e+00, -6.41382575e-01,
-1.11094289e-01, -3.20806690e-02, 6.67168975e-01, -3.80334437e-01,
2.60429978e-01, 2.25518560e+00, 6.14036143e-01, -1.76677987e-01,
-1.77043125e-01, 2.15038013e+00, -7.47117698e-01, -6.94940984e-01,
-8.47300291e-01, -8.48803222e-02, 8.57784092e-01, 5.92671931e-01,
8.59628022e-01, -2.30567765e+00, -7.14714766e-01, 1.16440272e+00,
-1.58269092e-01, 1.78712055e-01, 6.23534322e-01, 5.20011425e-01,
1.34862280e+00, 4.30682451e-01, 1.31071520e+00, 7.05770731e-01,
8.79335642e-01, 1.04244995e+00, -4.27514404e-01, 1.63319424e-01,
-1.31811261e+00, 3.38772506e-01, 6.16423368e-01, 9.51805353e-01,
-4.15812321e-02, 1.17271984e+00, -1.08082747e+00, -3.87872487e-01,
-6.65958166e-01, -8.24503243e-01, -1.04688489e+00, 3.82272780e-01,
-4.10087854e-01, 6.21690869e-01, -3.57830405e-01, -2.88853496e-01,
-3.67524803e-01, -6.23959303e-01, -1.07978083e-01, -7.72217512e-01,
-3.48409325e-01, 5.99236190e-01, -3.06323647e-01, 6.41631603e-01,
1.45477855e+00, -2.87821621e-01, 1.15087187e+00, -8.66089165e-01,
1.25644231e+00, 1.78619817e-01, -3.97542268e-01, 1.23447955e+00,
3.76234233e-01, 2.45014504e-01, 1.80363189e-02, -1.10603869e+00,
8.92204285e-01, 6.63499773e-01, 1.25622559e+00, -1.29876032e-01,
-5.97272754e-01, -3.75236988e-01, 4.66275960e-01, 2.05121726e-01,
8.55793536e-01, -4.02110481e+00, 1.44663119e+00, 2.47043297e-01,
6.41006827e-01, -2.76829362e-01, 9.26169932e-01, 4.13330823e-01,
3.42508614e-01, 1.37350821e+00, -1.03768575e+00, 2.95568883e-01,
7.84581363e-01, -6.02763034e-02, -1.78127453e-01, -8.99840117e-01,
1.44706023e+00, 1.71805096e+00, 1.68698475e-01, -5.69088042e-01,
-1.05945960e-01, -2.80161768e-01, -6.83287263e-01, -1.20706224e+00,
-6.09945178e-01, -1.30536163e+00, -2.01481447e-01, -2.13613780e-03,
8.87428403e-01, -7.25608051e-01, 1.03411388e+00, 6.20733440e-01,
-2.92526424e-01, -7.22831935e-02, 2.99865872e-01, -9.95002985e-01,
9.87491369e-01, 2.62370020e-01, 3.95637065e-01, -8.62838387e-01,
-3.04033637e-01, 3.50857317e-01, -4.46875505e-02, 9.52754319e-02,
-1.81237090e+00, 1.28286946e+00, -1.01697290e+00, 3.33020598e-01,
5.44873066e-02, 8.81249309e-02, -8.30183446e-01, -1.25741291e+00,
1.13769807e-01, 4.17068511e-01, -3.84530395e-01, -2.24328697e-01,
1.64285719e-01, 2.24695787e-01, -1.52362788e+00, -1.03049445e+00,
5.07936776e-01, 6.00863695e-01, 6.78084612e-01, 5.73505223e-01,
9.01764214e-01, 2.14455634e-01, -8.64829645e-02, 5.21133542e-01,
-3.84391618e+00, -4.32377815e-01, 3.86084586e-01, -5.74579954e-01,
-6.64480180e-02, 5.68239272e-01, 1.40972987e-01, 4.91561055e-01,
-1.59076011e+00, 4.58789200e-01, -3.81277895e+00, -3.56776267e-01,
-7.03040540e-01, 4.01638448e-01, -4.14998889e-01, 9.21632528e-01,
-2.88249195e-01, 5.43611228e-01, -1.87583610e-01, 1.77437171e-01,
3.09019029e-01, -8.21537554e-01, 9.64059532e-01, -4.18440819e-01,
-3.74807954e-01, -7.94256628e-01, 1.79041013e-01, 1.07198226e+00,
7.37931907e-01, -5.71660399e-01, 4.73279476e-01, -1.12489057e+00,
-8.20027113e-01, 1.16821051e+00, 1.13403749e+00, -8.37848783e-01,
-6.94696903e-01, -5.87071300e-01, -9.37979996e-01, 2.85793412e-02,
-1.03942998e-01, 8.15052748e-01, 2.77216405e-01, 1.07841063e+00,
8.49105597e-01, -1.22114134e+00, 1.31575480e-01, 4.65513058e-02,
-9.81023669e-01, -1.33689547e+00, 9.66923177e-01, -1.29471350e+00,
1.38094172e-01, 6.99724734e-01, -3.32960755e-01, 1.33537698e+00,
-3.91558230e-01, 5.63477397e-01, -2.24221781e-01, -2.44365379e-01,
-1.46850929e-01, -1.11731577e+00, -6.03061169e-02, -4.82764870e-01,
9.59276438e-01, 2.19666290e+00, -9.22762871e-01, 3.80298674e-01,
3.07932347e-01, 1.24999022e+00, 3.42894405e-01, 9.13683604e-03,
-2.37314433e-01, -5.78179881e-02, -1.28482133e-01, 8.15542936e-01,
4.25670408e-02, 1.63982916e+00, -2.73039818e-01, -1.85594022e-01,
-8.31672192e-01, 9.44418490e-01, -8.35879385e-01, 8.57529119e-02,
-2.79763341e-01, 9.06917334e-01, 3.24907333e-01, 3.79474759e-02,
1.75888743e-02, -3.66290390e-01, -5.95124885e-02, 6.87758029e-01,
5.07672548e-01, 8.89262438e-01, 1.69256181e-01, -2.01039672e-01,
5.85848987e-01, 1.67553282e+00, 4.36515212e-01, -3.62415373e-01,
8.22778121e-02, 4.75326419e-01, 1.03972077e+00, 1.17408001e+00,
1.08113861e+00, 1.17782259e+00, 7.19923496e-01, -6.69488251e-01,
1.36961073e-01, 1.56454515e+00, -4.26551163e-01, -9.29266691e-01,
7.18483865e-01, 3.78707618e-01, 7.51538515e-01, 9.56296325e-02,
2.21472085e-01, 2.28069518e-02, 5.18463790e-01, 1.02798796e+00,
2.11316586e-01, 1.61214125e+00, 7.54863799e-01, -9.27332759e-01,
-1.86410949e-01, -1.12251091e+00, 3.68848145e-01, 4.18064177e-01,
7.76560783e-01, 2.79443599e-02, -3.13859940e-01, -4.47741836e-01,
4.84260559e-01, 1.09082341e+00, 1.55656290e+00, -3.40694696e-01,
4.41435456e-01, 7.73885310e-01, 6.59892440e-01, -1.66957736e+00,
-1.25799298e+00, 7.65725374e-01, 7.07073510e-01, -1.03319263e+00,
5.52317381e-01, 3.94287586e-01, 3.92046362e-01, -1.49054527e-01,
-5.83782732e-01, -5.09692848e-01, 5.62452197e-01, 6.57410741e-01,
-5.00763655e-01, 4.95452195e-01, 3.58396247e-02, -7.79511034e-01,
4.42655563e-01, -2.28039667e-01, 8.00651312e-02, 1.79112375e+00,
3.67924958e-01, 1.30259705e+00, 1.01184118e+00, -8.26271296e-01,
-1.88019410e-01, 1.58395958e+00, -5.49807787e-01, 5.25783479e-01,
-8.63637924e-01, -5.48161387e-01, 4.79667455e-01, 4.72317845e-01,
-3.08872998e-01, 8.75118449e-02, -8.59509230e-01, 7.50889957e-01,
-2.64737934e-01, 7.62561083e-01, -4.49841231e-01, 1.45644456e-01,
1.01997578e+00, 3.77051145e-01, -2.72834718e-01, 4.99737054e-01,
-1.25988948e+00, -5.44752657e-01, 2.32418612e-01, 9.05423999e-01,
-5.48643887e-01, 7.33481586e-01, -5.10832906e-01, 1.30848140e-01,
7.81549454e-01, -4.22700197e-01, -3.84838343e-01, 1.11054122e+00,
-9.99024928e-01, 1.36228776e+00, -6.92071795e-01, -3.62365395e-01,
-2.13862792e-01, 7.48088360e-01, -6.61671400e-01, 3.47315609e-01],
dtype=float32)
feature_extractor(text, return_tensors="pt"): 입력 텍스트를 특징 추출 파이프라인으로 전달하고 결과를 PyTorch 텐서로 반환하도록 지정합니다.
[0]: 입력 텍스트의 특징 표현에 해당하는 출력의 첫 번째 요소를 선택합니다.
.numpy(): 파이토치 텐서를 NumPy 배열로 변환합니다.
.mean(axis=0): 시퀀스 차원(축 0)에 걸쳐 특징 벡터의 평균을 계산합니다. 이는 모든 토큰의 특징을 768차원의 단일 벡터로 집계하는 방법입니다(BART 베이스의 은닉 크기(hidden size)가 768이므로).
Last updated