from sklearn.datasets import fetch_20newsgroups
# Fetch the predefined train and test splits of the 20 Newsgroups corpus.
train = fetch_20newsgroups(subset="train")
test = fetch_20newsgroups(subset="test")

# Raw documents (X) and their integer class labels (y) for each split.
Xtrain, ytrain = train.data, train.target
Xtest, ytest = test.data, test.target

# Sanity check: the training documents and labels should have equal length.
print("X:", len(Xtrain))
print("y:", len(ytrain))
X: 11314 y: 11314
# Peek at the first training document and its label.
first_doc, first_label = Xtrain[0], ytrain[0]
print("X[0]:", first_doc)
print("y[0]:", first_label)
X[0]: From: lerxst@wam.umd.edu (where's my thing) Subject: WHAT car is this!? Nntp-Posting-Host: rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: 15 I was wondering if anyone out there could enlighten me on this car I saw the other day. It was a 2-door sports car, looked to be from the late 60s/ early 70s. It was called a Bricklin. The doors were really small. In addition, the front bumper was separate from the rest of the body. This is all I know. If anyone can tellme a model name, engine specs, years of production, where this car is made, history, or whatever info you have on this funky looking car, please e-mail. Thanks, - IL ---- brought to you by your neighborhood Lerxst ---- y[0]: 7
# Display the newsgroup names; the integer labels index into this list
# (e.g. label 7 corresponds to 'rec.autos').
train.target_names
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features built from bigrams only (ngram_range=(2, 2)), with English
# stop words removed before the n-grams are formed.
tfidf = TfidfVectorizer(
    stop_words="english",
    ngram_range=(2, 2),
)

# Learn the vocabulary and IDF weights from the training documents and
# vectorize them in a single pass.
Xtrain_tfidf = tfidf.fit_transform(Xtrain)

# Display the resulting sparse document-term matrix.
Xtrain_tfidf
<11314x1056749 sparse matrix of type '<class 'numpy.float64'>' with 1770842 stored elements in Compressed Sparse Row format>
# Inspect the fitted vocabulary: a dict mapping each bigram to its feature
# (column) index in the TF-IDF matrix.
tfidf.vocabulary_
{'lerxst wam': 562902, 'wam umd': 1008884, 'umd edu': 973615, 'edu thing': 351291, 'thing subject': 939876, 'subject car': 905377, 'car nntp': 209754, 'nntp posting': 663944, 'posting host': 736180, 'host rac3': 478145, 'rac3 wam': 773601, 'edu organization': 350457, 'organization university': 689066, 'university maryland': 978057, 'maryland college': 606447, 'college park': 243862, 'park lines': 700651, 'lines 15': 573524, '15 wondering': 18126, 'wondering enlighten': 1028812, 'enlighten car': 362063, 'car saw': 209878, 'saw day': 832052, 'day door': 294856, 'door sports': 334002, 'sports car': 886491, 'car looked': 209696, 'looked late': 584204, 'late 60s': 554397, '60s early': 69555, 'early 70s': 343653, '70s called': 75916, 'called bricklin': 204906, 'bricklin doors': 191293, 'doors really': 334078, 'really small': 783016, 'small addition': 871343, 'addition bumper': 106831, 'bumper separate': 196642, 'separate rest': 849667, 'rest body': 805699, 'body know': 183891, 'know tellme': 546412, 'tellme model': 932511, 'model engine': 632813, 'engine specs': 361062, 'specs years': 883636, 'years production': 1049805, 'production car': 752235, 'car history': 209617, 'history info': 470437, 'info funky': 500984, 'funky looking': 418136, 'looking car': 584392, 'car mail': 209709, 'mail thanks': 597038, 'thanks il': 937326, 'il brought': 489699, 'brought neighborhood': 193078, 'neighborhood lerxst': 655884, 'guykuo carson': 451013, 'carson washington': 213084, 'washington edu': 1012140, 'edu guy': 349323, 'guy kuo': 450858, 'kuo subject': 549499, 'subject si': 907376, 'si clock': 861251, 'clock poll': 239494, 'poll final': 730718, 'final summary': 398859, 'summary final': 911712, 'final si': 398843, 'clock reports': 239503, 'reports keywords': 800071, 'keywords si': 539629, 'si acceleration': 861236, 'acceleration clock': 99496, 'clock upgrade': 239539, 'upgrade article': 980914, 'article shelley': 142072, 'shelley 1qvfo9innc3s': 857077, '1qvfo9innc3s organization': 
31933, 'university washington': 978324, 'washington lines': 1012174, 'lines 11': 573475, '11 nntp': 10317, 'host carson': 477152, 'edu fair': 349083, 'fair number': 385241, 'number brave': 669616, 'brave souls': 190021, 'souls upgraded': 878573, 'upgraded si': 981063, 'clock oscillator': 239487, 'oscillator shared': 691194, 'shared experiences': 856124, 'experiences poll': 378051, 'poll send': 730737, 'send brief': 847780, 'brief message': 191379, 'message detailing': 619828, 'detailing experiences': 310588, 'experiences procedure': 378054, 'procedure speed': 750613, 'speed attained': 884077, 'attained cpu': 149325, 'cpu rated': 279277, 'rated speed': 777586, 'speed add': 884062, 'add cards': 106225, 'cards adapters': 210860, 'adapters heat': 106052, 'heat sinks': 463393, 'sinks hour': 866033, 'hour usage': 479186, 'usage day': 982304, 'day floppy': 294908, 'floppy disk': 403883, 'disk functionality': 323432, 'functionality 800': 417346, '800 floppies': 81698, 'floppies especially': 403827, 'especially requested': 367406, 'requested summarizing': 801311, 'summarizing days': 911573, 'days add': 295477, 'add network': 106361, 'network knowledge': 657430, 'knowledge base': 546842, 'base clock': 163069, 'upgrade haven': 980956, 'haven answered': 459419, 'answered poll': 126606, 'poll thanks': 730741, 'thanks guy': 937300, 'kuo guykuo': 549494, 'guykuo washington': 451014, 'twillis ec': 968971, 'ec ecn': 345809, 'ecn purdue': 346150, 'purdue edu': 763823, 'edu thomas': 351294, 'thomas willis': 943427, 'willis subject': 1022362, 'subject pb': 906898, 'pb questions': 706210, 'questions organization': 770244, 'organization purdue': 688726, 'purdue university': 763831, 'university engineering': 977897, 'engineering computer': 361259, 'computer network': 257176, 'network distribution': 657370, 'distribution usa': 325883, 'usa lines': 982113, 'lines 36': 573676, '36 folks': 51728, 'folks mac': 405471, 'mac plus': 593892, 'plus finally': 726931, 'finally gave': 398955, 'gave 
ghost': 423605, 'ghost weekend': 429055, 'weekend starting': 1017502, 'starting life': 892173, 'life 512k': 567146, '512k way': 64372, 'way 1985': 1013789, '1985 sooo': 25851, 'sooo market': 877490, 'market new': 605247, 'new machine': 658803, 'machine bit': 594215, 'bit sooner': 178921, 'sooner intended': 877478, 'intended looking': 508512, 'looking picking': 584663, 'picking powerbook': 718282, 'powerbook 160': 738240, '160 maybe': 19860, 'maybe 180': 609792, '180 bunch': 22617, 'bunch questions': 196771, 'questions hopefully': 770144, 'hopefully somebody': 476073, 'somebody answer': 876385, 'answer does': 126286, 'does anybody': 329161, 'anybody know': 127574, 'know dirt': 545126, 'dirt round': 320371, 'round powerbook': 819678, 'powerbook introductions': 738256, 'introductions expected': 512787, 'expected heard': 377048, 'heard 185c': 462396, '185c supposed': 23032, 'supposed make': 915439, 'make appearence': 598606, 'appearence summer': 129661, 'summer haven': 912032, 'haven heard': 459495, 'heard anymore': 462413, 'anymore don': 127738, 'don access': 332430, 'access macleak': 100486, 'macleak wondering': 595038, 'wondering anybody': 1028779, 'anybody info': 127560, 'info anybody': 500837, 'anybody heard': 127547, 'heard rumors': 462839, 'rumors price': 822158, 'price drops': 744729, 'drops powerbook': 339879, 'powerbook line': 738259, 'line like': 573001, 'like ones': 570242, 'ones duo': 681379, 'duo just': 341582, 'just went': 533361, 'went recently': 1018809, 'recently impression': 785983, 'impression display': 494354, 'display 180': 324180, '180 probably': 22642, 'probably swing': 748427, 'swing 180': 919369, '180 got': 22623, 'got 80mb': 438626, '80mb disk': 82152, 'disk 120': 323284, '120 don': 12458, 'don really': 333218, 'really feel': 782640, 'feel better': 392786, 'better display': 173686, 'display yea': 324550, 'yea looks': 1048059, 'looks great': 584916, 'great store': 444172, 'store wow': 899290, 'wow really': 1035057, 'really good': 782667, 'good 
solicit': 437936, 'solicit opinions': 875757, 'opinions people': 684054, 'people use': 710547, 'use 160': 982442, '160 180': 19818, '180 day': 22618, 'day day': 294834, 'day worth': 295407, 'worth taking': 1034428, 'taking disk': 924326, 'disk size': 323589, 'size money': 867902, 'money hit': 635332, 'hit active': 470682, 'active display': 103980, 'display realize': 324452, 'realize real': 782221, 'real subjective': 781736, 'subjective question': 908022, 'question ve': 769823, 've played': 995373, 'played machines': 724899, 'machines computer': 594632, 'computer store': 257309, 'store breifly': 899171, 'breifly figured': 190853, 'figured opinions': 396303, 'opinions somebody': 684104, 'somebody actually': 876383, 'actually uses': 105400, 'uses machine': 987216, 'machine daily': 594257, 'daily prove': 290262, 'prove helpful': 758985, 'helpful does': 466231, 'does hellcats': 329612, 'hellcats perform': 464891, 'perform thanks': 711441, 'thanks bunch': 937133, 'bunch advance': 196677, 'advance info': 109630, 'info email': 500958, 'email ll': 356635, 'll post': 579541, 'post summary': 735435, 'summary news': 911812, 'news reading': 660023, 'reading time': 780948, 'time premium': 948150, 'premium finals': 741279, 'finals just': 399122, 'just corner': 531827, 'corner tom': 272754, 'tom willis': 952702, 'willis twillis': 1022363, 'twillis ecn': 968972, 'edu purdue': 350649, 'purdue electrical': 763824, 'electrical engineering': 354930, 'engineering convictions': 361263, 'convictions dangerous': 270646, 'dangerous enemies': 291277, 'enemies truth': 360259, 'truth lies': 964372, 'lies nietzsche': 567071, 'jgreen amber': 525001, 'amber joe': 121040, 'joe green': 526631, 'green subject': 445052, 'subject weitek': 907838, 'weitek p9000': 1018174, 'p9000 organization': 695762, 'organization harris': 688167, 'harris computer': 458412, 'computer systems': 257319, 'systems division': 921353, 'division lines': 326602, 'lines 14': 573512, '14 distribution': 15660, 'distribution 
world': 325894, 'world nntp': 1033151, 'host amber': 476965, 'amber ssd': 121046, 'ssd csd': 888295, 'csd harris': 284599, 'harris com': 458411, 'com newsreader': 246663, 'newsreader tin': 660716, 'tin version': 949717, 'version pl9': 997535, 'pl9 robert': 721686, 'robert kyanko': 816307, 'kyanko rob': 549841, 'rob rjck': 816092, 'rjck uucp': 814392, 'uucp wrote': 990049, 'wrote abraxis': 1039584, 'abraxis iastate': 98230, 'iastate edu': 484962, 'edu writes': 351618, 'writes article': 1036213, 'article abraxis': 140433, 'abraxis 734340159': 98229, '734340159 class1': 76989, 'class1 iastate': 236740, 'edu know': 349845, 'know weitek': 546574, 'p9000 graphics': 695758, 'graphics chip': 442814, 'chip far': 227742, 'far low': 388382, 'low level': 588210, 'level stuff': 564578, 'stuff goes': 903782, 'goes looks': 435025, 'looks pretty': 584952, 'pretty nice': 743742, 'nice got': 661612, 'got quadrilateral': 439217, 'quadrilateral command': 767923, 'command requires': 249786, 'requires just': 802230, 'just points': 532748, 'points weitek': 729419, 'weitek address': 1018170, 'address phone': 107588, 'phone number': 716708, 'number like': 669940, 'like information': 569790, 'information chip': 501603, 'chip joe': 227800, 'green harris': 445004, 'harris corporation': 458414, 'corporation jgreen': 273082, 'jgreen csd': 525002, 'com computer': 245653, 'division thing': 326704, 'thing really': 939744, 'really scares': 782985, 'scares person': 835607, 'person sense': 713541, 'sense humor': 848535, 'humor jonathan': 482408, 'jonathan winters': 528075, 'jcm head': 522434, 'head cfa': 460996, 'cfa harvard': 220526, 'harvard edu': 458647, 'edu jonathan': 349703, 'jonathan mcdowell': 528060, 'mcdowell subject': 611495, 'subject shuttle': 907375, 'shuttle launch': 861039, 'launch question': 555594, 'question organization': 769545, 'organization smithsonian': 688867, 'smithsonian astrophysical': 872450, 'astrophysical observatory': 147094, 'observatory cambridge': 673931, 'cambridge 
ma': 206211, 'ma usa': 593495, 'usa distribution': 982048, 'distribution sci': 325844, 'sci lines': 837212, 'lines 23': 573608, '23 article': 37398, 'article c5owcb': 140998, 'c5owcb n3p': 201294, 'n3p world': 648322, 'world std': 1033326, 'std com': 895541, 'com tombaker': 247244, 'tombaker world': 952772, 'com tom': 247241, 'tom baker': 952543, 'baker article': 160780, 'article c5jlwx': 140873, 'c5jlwx 4h9': 201146, '4h9 cs': 61757, 'cs cmu': 284272, 'cmu edu': 241216, 'edu etrat': 349048, 'etrat ttacs1': 368992, 'ttacs1 ttu': 966040, 'ttu edu': 966138, 'edu pack': 350482, 'pack rat': 696161, 'rat writes': 777236, 'writes clear': 1036456, 'clear caution': 237445, 'caution warning': 216572, 'warning memory': 1011463, 'memory verify': 617807, 'verify unexpected': 996850, 'unexpected errors': 975841, 'errors wondering': 366443, 'wondering expected': 1028814, 'expected error': 377028, 'error sorry': 366231, 'sorry really': 877863, 'really dumb': 782592, 'dumb question': 341346, 'question parity': 769556, 'parity errors': 700579, 'errors memory': 366368, 'memory previously': 617686, 'previously known': 744475, 'known conditions': 547224, 'conditions waivered': 259682, 'waivered yes': 1008174, 'yes error': 1050503, 'error knew': 366134, 'knew curious': 544148, 'curious real': 286066, 'real meaning': 781541, 'meaning quote': 613353, 'quote tom': 771786, 'tom understanding': 952689, 'understanding expected': 975299, 'expected errors': 377029, 'errors basically': 366304, 'basically known': 164655, 'known bugs': 547193, 'bugs warning': 195165, 'warning software': 1011492, 'software things': 874977, 'things checked': 940110, 'checked don': 225399, 'don right': 333273, 'right values': 812553, 'values aren': 992853, 'aren set': 135158, 'set till': 853164, 'till launch': 946651, 'launch suchlike': 555628, 'suchlike fix': 909462, 'fix code': 401586, 'code possibly': 242278, 'possibly introduce': 734809, 'introduce new': 512618, 'new bugs': 658282, 'bugs just': 195128, 'just 
tell': 533205, 'tell crew': 931844, 'crew ok': 281492, 'ok warning': 679331, 'warning 213': 1011406, '213 liftoff': 35751, 'liftoff ignore': 568000, 'ignore jonathan': 488473, 'dfo vttoulu': 313286, 'vttoulu tko': 1005180, 'tko vtt': 950633, 'vtt fi': 1005177, 'fi foxvog': 394841, 'foxvog douglas': 411754, 'douglas subject': 335472, 'subject rewording': 907205, 'rewording second': 809390, 'second amendment': 842558, 'amendment ideas': 121288, 'ideas organization': 487218, 'organization vtt': 689125, 'vtt lines': 1005178, 'lines 58': 573733, '58 article': 66646, 'article 1r1eu1': 139937, '1r1eu1 4t': 32051, '4t transfer': 62479, 'transfer stratus': 958582, 'stratus com': 900481, 'com cdt': 245584, 'cdt sw': 217728, 'sw stratus': 918697, 'com tavares': 247191, 'tavares writes': 927370, 'article 1993apr20': 139413, '1993apr20 083057': 28710, '083057 16899': 4226, '16899 ousrvr': 20473, 'ousrvr oulu': 691973, 'oulu fi': 691948, 'fi dfo': 394830, 'douglas writes': 335475, 'article 1qv87v': 139883, '1qv87v 4j3': 31901, '4j3 transfer': 61826, 'article c5n3gi': 140975, 'c5n3gi f8f': 201267, 'f8f ulowell': 382353, 'ulowell ulowell': 973225, 'ulowell edu': 973224, 'edu jrutledg': 349730, 'jrutledg cs': 529558, 'cs ulowell': 284438, 'edu john': 349692, 'john lawrence': 527098, 'lawrence rutledge': 556697, 'rutledge writes': 824518, 'writes massive': 1037328, 'massive destructive': 607189, 'destructive power': 310374, 'power modern': 737944, 'modern weapons': 633810, 'weapons makes': 1016476, 'makes cost': 599839, 'cost accidental': 274437, 'accidental crimial': 100892, 'crimial usage': 281946, 'usage weapons': 982366, 'weapons great': 1016434, 'great weapons': 444256, 'weapons mass': 1016478, 'mass destruction': 606772, 'destruction need': 310331, 'need control': 653799, 'control government': 268704, 'government individual': 440598, 'individual access': 499057, 'access result': 100569, 'result needless': 806637, 'needless deaths': 655038, 'deaths millions': 298577, 'millions 
makes': 626004, 'makes right': 600040, 'right people': 812221, 'people bear': 708801, 'bear modern': 167083, 'weapons non': 1016492, 'non existant': 664568, 'existant thanks': 375600, 'thanks stating': 937626, 'stating coming': 894405, 'coming needless': 249456, 'needless say': 655042, 'say disagree': 832734, 'disagree count': 320599, 'count believe': 275458, 'believe individuals': 170155, 'individuals right': 499411, 'right weapons': 812582, 'destruction hard': 310318, 'hard believe': 457055, 'believe support': 170580, 'support neighbor': 914398, 'neighbor right': 655845, 'right nuclear': 812167, 'nuclear weapons': 669303, 'weapons biological': 1016356, 'biological weapons': 177788, 'weapons nerve': 1016490, 'nerve gas': 656294, 'gas property': 422974, 'property agree': 756678, 'agree keeping': 113779, 'keeping weapons': 536671, 'destruction hands': 310317, 'hands individuals': 455190, 'individuals hope': 499360, 'hope don': 475682, 'don sign': 333337, 'sign blank': 862090, 'blank checks': 180509, 'checks course': 225609, 'course term': 277604, 'term rigidly': 933819, 'rigidly defined': 813158, 'defined doug': 302653, 'doug foxvog': 335304, 'foxvog says': 411755, 'says weapons': 834745, 'destruction means': 310328, 'means cbw': 613580, 'cbw nukes': 216980, 'nukes sarah': 669366, 'sarah brady': 830318, 'brady says': 189374, 'means street': 614016, 'street sweeper': 900891, 'sweeper shotguns': 919157, 'shotguns semi': 859506, 'semi automatic': 847286, 'automatic sks': 152820, 'sks rifles': 869070, 'rifles doubt': 811393, 'doubt uses': 335197, 'uses term': 987342, 'term using': 933868, 'using quote': 988149, 'quote allegedly': 771631, 'allegedly john': 118544, 'rutledge says': 824514, 'destruction immediately': 310321, 'immediately follows': 492003, 'follows thousands': 406682, 'thousands people': 944504, 'people killed': 709624, 'killed year': 541393, 'year handguns': 1048603, 'handguns number': 454587, 'number easily': 669753, 'easily reduced': 344768, 'reduced 
putting': 788371, 'putting reasonable': 764976, 'reasonable restrictions': 784107, 'restrictions does': 806364, 'does rutledge': 330100, 'rutledge mean': 824511, 'mean term': 613130, 'term read': 933808, 'read article': 779728, 'article presenting': 141938, 'presenting argument': 742157, 'argument weapons': 136176, 'destruction commonly': 310308, 'commonly understood': 251830, 'understood switching': 975534, 'switching topics': 919808, 'topics point': 954377, 'point evidently': 728085, 'evidently weapons': 371398, 'weapons allowed': 1016327, 'allowed later': 119332, 'later analysis': 554615, 'analysis given': 123393, 'given understanding': 430643, 'understanding consider': 975281, 'consider class': 262983, 'class cdt': 236526, 'cdt rocket': 217727, 'rocket sw': 816953, 'com believe': 245453, 'believe speak': 170543, 'speak company': 881816, 'company cdt': 252891, 'cdt vos': 217730, 'vos stratus': 1004060, 'com write': 247398, 'write today': 1035858, 'today special': 951648, 'special investors': 882502, 'investors packet': 513630, 'packet doug': 696804, 'douglas foxvog': 335438, 'foxvog vtt': 411756, 'bmdelane quads': 182340, 'quads uchicago': 767930, 'uchicago edu': 971749, 'edu brian': 348397, 'brian manning': 191167, 'manning delaney': 602893, 'delaney subject': 303768, 'subject brain': 905308, 'brain tumor': 189507, 'tumor treatment': 966598, 'treatment thanks': 960409, 'thanks reply': 937556, 'reply bmdelane': 798361, 'bmdelane midway': 182339, 'midway uchicago': 624087, 'university chicago': 977822, 'chicago lines': 226476, 'lines 12': 573487, '12 people': 12221, 'people responded': 710137, 'responded request': 804839, 'request info': 801115, 'info treatment': 501285, 'treatment astrocytomas': 960268, 'astrocytomas email': 146868, 'email couldn': 356453, 'couldn thank': 275287, 'thank directly': 936857, 'directly mail': 319818, 'mail bouncing': 596408, 'bouncing probs': 187822, 'probs sean': 750524, 'sean debra': 841380, 'debra sharon': 298805, 'sharon 
thought': 856414, 'thought publicly': 944083, 'publicly thank': 762273, 'thank thanks': 936996, 'thanks sure': 937636, 'sure glad': 916133, 'glad accidentally': 431504, 'accidentally hit': 100918, 'hit rn': 470890, 'rn instead': 815442, 'instead rm': 506470, 'rm trying': 815205, 'trying delete': 965414, 'delete file': 303935, 'file september': 397311, 'september hmmm': 849945, 'hmmm news': 471938, 'news brian': 659752, 'bgrubb dante': 174649, 'dante nmsu': 291574, 'nmsu edu': 663868, 'edu grubb': 349300, 'grubb subject': 447747, 'subject ide': 906211, 'ide vs': 486387, 'vs scsi': 1004986, 'scsi organization': 840412, 'organization new': 688566, 'new mexico': 658848, 'mexico state': 621595, 'state university': 893156, 'university las': 978028, 'las cruces': 554117, 'cruces nm': 283232, 'nm lines': 663804, 'lines 44': 573698, '44 distribution': 58554, 'host dante': 477297, 'edu dxb132': 348950, 'dxb132 psuvm': 342326, 'psuvm psu': 760873, 'psu edu': 760854, 'article 1qlbrlinn7rk': 139747, '1qlbrlinn7rk dns1': 31713, 'dns1 nmsu': 327594, 'edu bgrubb': 348296, 'grubb says': 447746, 'says pc': 834547, 'pc magazine': 706553, 'magazine april': 595613, 'april 27': 132867, '27 1993': 41721, '1993 29': 27182, '29 scsi': 43640, 'scsi twice': 840485, 'twice fasst': 968878, 'fasst esdi': 389085, 'esdi 20': 366983, '20 faster': 33482, 'faster ide': 389544, 'ide support': 486378, 'support devices': 914161, 'devices acceptance': 312772, 'acceptance long': 100033, 'long stalled': 582882, 'stalled incompatability': 889456, 'incompatability problems': 497216, 'problems installation': 750145, 'installation headaches': 505580, 'headaches love': 461318, 'love magazine': 587747, 'magazine writers': 595770, 'writers make': 1035979, 'make stupid': 599523, 'stupid statements': 904237, 'statements like': 893776, 'like performance': 570325, 'performance numbers': 711619, 'numbers ll': 670555, 'll list': 579441, 'list actual': 575207, 'actual performance': 104661, 'performance ranges': 711645, 
'ranges convince': 776602, 'convince statement': 270743, 'statement absurd': 893397, 'absurd scsi': 98829, 'scsi ranges': 840435, 'ranges 5mb': 776600, '5mb scsi': 67962, 'scsi ii': 840364, 'ii ranges': 489042, 'ranges 40mb': 776599, '40mb ide': 57077, 'ide ranges': 486364, 'ranges 3mb': 776598, '3mb esdi': 55066, 'esdi 25mb': 366984, '25mb non': 40876, 'non standard': 664815, 'standard versions': 890209, 'versions shows': 997874, 'shows don': 860498, 'don know': 332964, 'know scsi': 546218, 'scsi scsi': 840446, 'scsi controler': 840286, 'controler chip': 269081, 'chip range': 227922, 'range 5mb': 776233, '5mb right': 67961, 'right scsi': 812363, 'scsi controller': 840288, 'controller chip': 269232, 'chip 6mb': 227593, '6mb 10mb': 74207, '10mb burst': 9786, 'burst bit': 197572, 'bit note': 178748, 'note increase': 666546, 'increase speed': 497706, 'speed mac': 884254, 'mac quadra': 593912, 'quadra uses': 767891, 'uses version': 987365, 'version scsi': 997606, 'scsi does': 840311, 'does exist': 329493, 'exist pc': 375437, 'pc use': 706692, 'use set': 983979, 'set scsi': 853074, 'scsi bit': 840250, 'bit scsi': 178881, 'scsi mode': 840401, 'mode 6mb': 632387, 'burst scsi': 197596, 'scsi 16': 840225, '16 bit': 19349, 'bit wide': 179033, 'wide fast': 1020909, 'fast mode': 389285, 'mode 12mb': 632367, '12mb 20mb': 13654, '20mb burst': 34936, 'scsi 32': 840228, '32 bit': 48564, 'fast 15': 389100, '15 20mb': 17447, '20mb 40mb': 34932, '40mb burst': 57071, 'burst data': 197576, 'data scsi': 292743, 'twice fast': 968879, 'fast esdi': 389210, 'esdi correct': 366991, 'correct scsi': 273552, 'chip scsi': 227954, 'scsi reach': 840436, 'reach 10mb': 779221, '10mb 20': 9783, 'ide 120': 486271, '120 96': 12445, '96 scsi': 89322, 'scsi facts': 840332, 'facts posted': 384529, 'posted newsgroup': 735707, 'newsgroup mac': 660319, 'mac ibm': 593809, 'ibm info': 485130, 'info sheet': 501229, 'sheet available': 856756, 'available ftp': 153656, 'ftp sumex': 416250, 'sumex aim': 911394, 
'aim stanford': 115383, 'stanford edu': 890699, 'edu 36': 347903, '36 44': 51667, '44 info': 58578, 'info mac': 501075, 'mac report': 593922, 'report mac': 799558, 'ibm compare': 485081, 'compare version': 253377, 'version txt': 997672, 'txt 173': 969348, '173 161': 21594, '161 problem': 19977, 'problem mac': 749300, 'ibm pc': 485182, 'pc inconsiant': 706504, 'inconsiant scsi': 497332, 'scsi documented': 840310, 'documented quadra': 328558, 'quadra scsi': 767885, 'scsi chip': 840270, 'chip apple': 227610, 'apple salesperson': 130222, 'salesperson said': 829269, 'said uses': 828383, 'uses fast': 987139, 'fast scsi': 389360, 'burst does': 197580, 'does scsi': 330122, 'scsi 5mb': 840232, '5mb maximum': 67959, 'maximum synchronous': 609719, 'synchronous quadra': 920643, 'uses ansynchronous': 987054, 'ansynchronous scsi': 126826, 'scsi slower': 840455, 'slower mac': 871031, 'ibm scsi': 485219, 'scsi interface': 840373, 'interface think': 510158, 'think scsi': 942364, 'scsi maybe': 840397, 'maybe scsi': 610233, 'interface driven': 510008, 'driven machine': 338439, 'machine scsi': 594483, 'chip bit': 227626, 'bit mode': 178726, 'mode faster': 632494, 'faster true': 389642, 'true scsi': 963546, 'scsi don': 840313, 'don slam': 333347, 'slam article': 869476, 'article don': 141339, 'don understand': 333471, 'understand going': 974951, 'going reference': 435941, 'reference quadra': 788976, 'chip digital': 227702, 'digital review': 318551, 'review oct': 808848, 'oct 21': 675758, '21 1991': 35038, '1991 v8': 26651, 'v8 n33': 991525, 'n33 p8': 648301, 'holmes7000 iscsvax': 473633, 'iscsvax uni': 516034, 'uni edu': 976345, 'edu subject': 351169, 'subject win': 907856, 'win icon': 1022661, 'icon help': 485690, 'help organization': 465777, 'university northern': 978110, 'northern iowa': 665941, 'iowa lines': 514466, 'lines 10': 573460, '10 win': 7590, 'win downloaded': 1022620, 'downloaded icons': 335601, 'icons bmp': 485738, 'bmp figure': 182390, 'figure change': 396085, 'change 
wallpaper': 222376, 'wallpaper use': 1008722, 'use icons': 983274, 'icons help': 485751, 'help appreciated': 465256, 'appreciated thanx': 131868, 'thanx brando': 937729, 'brando ps': 189881, 'ps mail': 760510, 'kerr ux1': 537665, 'ux1 cso': 990792, 'cso uiuc': 284718, 'uiuc edu': 972425, 'edu stan': 351126, ...}
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes with additive smoothing alpha=1, trained on the
# bigram TF-IDF features of the training documents.
nb = MultinomialNB(alpha=1)
nb.fit(X=Xtrain_tfidf, y=ytrain)
MultinomialNB(alpha=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
MultinomialNB(alpha=1)
Evaluate on the test set using classification_report
We will focus on the F1-score
from sklearn.metrics import classification_report
# Vectorize the test documents with the already-fitted vectorizer
# (transform only, so they share the training feature columns).
Xtest_tfidf = tfidf.transform(Xtest)

# Predict test labels and print per-class precision / recall / F1.
ypred = nb.predict(Xtest_tfidf)
report = classification_report(ytest, ypred)
print(report)
precision recall f1-score support 0 0.76 0.78 0.77 319 1 0.59 0.51 0.55 389 2 0.64 0.61 0.63 394 3 0.55 0.61 0.58 392 4 0.71 0.60 0.65 385 5 0.71 0.60 0.65 395 6 0.68 0.70 0.69 390 7 0.65 0.69 0.67 396 8 0.71 0.84 0.77 398 9 0.68 0.80 0.73 397 10 0.67 0.87 0.76 399 11 0.74 0.90 0.81 396 12 0.68 0.56 0.62 393 13 0.66 0.50 0.57 396 14 0.67 0.84 0.75 394 15 0.84 0.87 0.85 398 16 0.73 0.82 0.77 364 17 0.82 0.79 0.80 376 18 0.87 0.58 0.70 310 19 0.85 0.46 0.60 251 accuracy 0.70 7532 macro avg 0.71 0.70 0.70 7532 weighted avg 0.71 0.70 0.70 7532
from sklearn.pipeline import Pipeline
# Chain vectorization and classification so raw text can be passed straight
# to fit/predict. Both steps use their defaults apart from stop-word removal.
pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer(stop_words="english")),
        ("nb", MultinomialNB()),
    ]
)

# Train on the raw training documents, then score on the held-out test set.
pipeline.fit(Xtrain, ytrain)
ypred = pipeline.predict(Xtest)
report = classification_report(ytest, ypred)
print(report)
precision recall f1-score support 0 0.80 0.69 0.74 319 1 0.78 0.72 0.75 389 2 0.79 0.72 0.75 394 3 0.68 0.81 0.74 392 4 0.86 0.81 0.84 385 5 0.87 0.78 0.82 395 6 0.87 0.80 0.83 390 7 0.88 0.91 0.90 396 8 0.93 0.96 0.95 398 9 0.91 0.92 0.92 397 10 0.88 0.98 0.93 399 11 0.75 0.96 0.84 396 12 0.84 0.65 0.74 393 13 0.92 0.79 0.85 396 14 0.82 0.94 0.88 394 15 0.62 0.96 0.76 398 16 0.66 0.95 0.78 364 17 0.95 0.94 0.94 376 18 0.94 0.52 0.67 310 19 0.95 0.24 0.38 251 accuracy 0.82 7532 macro avg 0.84 0.80 0.80 7532 weighted avg 0.83 0.82 0.81 7532
Now we will use grid search cross-validation to find the model with the best hyperparameters
from sklearn.model_selection import GridSearchCV
# Search space: 3 n-gram ranges x 4 smoothing values = 12 candidates.
# Keys use the "<step name>__<parameter>" form to address pipeline steps.
params = {
    "tfidf__ngram_range": [(1, 1), (1, 2), (2, 2)],
    "nb__alpha": [0.01, 0.1, 1, 10],
}

# Score every candidate by macro-averaged F1 under 5-fold cross-validation.
gridcv = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring="f1_macro",
    cv=5,
)
gridcv.fit(Xtrain, ytrain)
GridSearchCV(cv=5, estimator=Pipeline(steps=[('tfidf', TfidfVectorizer(stop_words='english')), ('nb', MultinomialNB())]), param_grid={'nb__alpha': [0.01, 0.1, 1, 10], 'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2)]}, scoring='f1_macro')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GridSearchCV(cv=5, estimator=Pipeline(steps=[('tfidf', TfidfVectorizer(stop_words='english')), ('nb', MultinomialNB())]), param_grid={'nb__alpha': [0.01, 0.1, 1, 10], 'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2)]}, scoring='f1_macro')
Pipeline(steps=[('tfidf', TfidfVectorizer(stop_words='english')), ('nb', MultinomialNB())])
TfidfVectorizer(stop_words='english')
MultinomialNB()
# Display the pipeline configured with the best hyperparameter combination
# found by the grid search.
gridcv.best_estimator_
Pipeline(steps=[('tfidf', TfidfVectorizer(ngram_range=(1, 2), stop_words='english')), ('nb', MultinomialNB(alpha=0.01))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('tfidf', TfidfVectorizer(ngram_range=(1, 2), stop_words='english')), ('nb', MultinomialNB(alpha=0.01))])
TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
MultinomialNB(alpha=0.01)
# Predict with the grid search's best model and report test-set metrics.
ypred = gridcv.predict(Xtest)
report = classification_report(ytest, ypred)
print(report)
precision recall f1-score support 0 0.85 0.84 0.84 319 1 0.71 0.71 0.71 389 2 0.71 0.66 0.68 394 3 0.65 0.72 0.68 392 4 0.82 0.81 0.81 385 5 0.81 0.79 0.80 395 6 0.80 0.81 0.81 390 7 0.88 0.89 0.89 396 8 0.93 0.95 0.94 398 9 0.93 0.92 0.92 397 10 0.93 0.96 0.95 399 11 0.87 0.92 0.89 396 12 0.78 0.73 0.75 393 13 0.87 0.81 0.84 396 14 0.84 0.90 0.87 394 15 0.85 0.95 0.89 398 16 0.79 0.91 0.84 364 17 0.96 0.94 0.95 376 18 0.84 0.67 0.74 310 19 0.80 0.61 0.69 251 accuracy 0.83 7532 macro avg 0.83 0.82 0.83 7532 weighted avg 0.83 0.83 0.83 7532
alpha
.f1_macro
score on the test set. sklearn's
parameter name)