Commit 58790b0

authored

Merge pull request #108 from gauravtrivedi967/patch-4

HindiDataCleaning.py

2 parents 3e65cb5 + 1308e9b, commit 58790b0 (Copy full SHA for 58790b0)

File tree

1 file changed

+78

-0

lines changed

HindiDataCleaning.py

1 file changed

+78

-0

lines changed

`‎HindiDataCleaning.py‎`

Lines changed: 78 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,78 @@`
	`1`	`+#%%`
	`2`	`+'''`
	`3`	`+pip install googletrans==3.1.0a0`
	`4`	`+`
	`5`	`+from googletrans import Translator`
	`6`	`+from google.colab import files`
	`7`	`+`
	`8`	`+uploaded = files.upload()`
	`9`	`+'''`
	`10`	`+importpandasaspd`
	`11`	`+df=pd.read_csv("C:/Users/GAURAV/Downloads/data.csv")`
	`12`	`+df=df[['first_party','property_description','second_party']]`
	`13`	`+print(df.head(10))`
	`14`	`+`
	`15`	`+#pip install indic-nlp-library`
	`16`	`+`
	`17`	`+#pip install xlrd==1.2.0`
	`18`	`+`
	`19`	`+'''`
	`20`	`+from indicnlp.tokenize import indic_tokenize`
	`21`	`+def tokenization(indic_string):`
	`22`	`+ tokens = []`
	`23`	`+ for t in indic_tokenize.trivial_tokenize(indic_string):`
	`24`	`+ tokens.append(t)`
	`25`	`+ return tokens`
	`26`	`+prop=['first_party','property_description','second_party']`
	`27`	`+for i in prop:`
	`28`	`+ try:`
	`29`	`+ df[i] = df[i].apply(lambda x: tokenization(x))`
	`30`	`+ except:`
	`31`	`+ df[i] = df[i].astype(str)`
	`32`	`+ df[i] = df[i].apply(lambda x: tokenization(x))`
	`33`	`+`
	`34`	`+df['first_party'] = df['first_party'].apply(lambda x: tokenization(x))`
	`35`	`+df["second_party"] = df["second_party"].astype(str)`
	`36`	`+df['second_party'] = df['second_party'].apply(lambda x: tokenization(x))`
	`37`	`+'''`
	`38`	`+#%%`
	`39`	`+fromindicnlp.tokenizeimportindic_tokenize`
	`40`	`+deftokenization(indic_string):`
	`41`	`+tokens= []`
	`42`	`+fortinindic_tokenize.trivial_tokenize(indic_string):`
	`43`	`+tokens.append(t)`
	`44`	`+returntokens`
	`45`	`+prop=['first_party','property_description','second_party']`
	`46`	`+foriinprop:`
	`47`	`+try:`
	`48`	`+df[i]=df[i].apply(lambdax:tokenization(x))`
	`49`	+stopwords_hi= ['तुम','मेरी','मुझे','क्योंकि','हम','प्रति','अबकी','आगे','माननीय','शहर','बताएं','कौनसी','क्लिक','किसकी','बड़े','मैं','and','रही','आज','लें','आपके','मिलकर','सब','मेरे','जी','श्री','वैसा','आपका','अंदर','अत','अपना','अपनी','अपने','अभी','आदि','आप','इत्यादि','इन','इनका','इन्हीं','इन्हें','इन्हों','इस','इसका','इसकी','इसके','इसमें','इसी','इसे','उन','उनका','उनकी','उनके','उनको','उन्हीं','उन्हें','उन्हों','उस','उसके','उसी','उसे','एक','एवं','एस','ऐसे','और','कई','कर','करता','करते','करना','करने','करें','कहते','कहा','का','काफ़ी','कि','कितना','किन्हें','किन्हों','किया','किर','किस','किसी','किसे','की','कुछ','कुल','के','को','कोई','कौन','कौनसा','गया','घर','जब','जहाँ','जा','जितना','जिन','जिन्हें','जिन्हों','जिस','जिसे','जीधर','जैसा','जैसे','जो','तक','तब','तरह','तिन','तिन्हें','तिन्हों','तिस','तिसे','तो','था','थी','थे','दबारा','दिया','दुसरा','दूसरे','दो','द्वारा','न','नहीं','ना','निहायत','नीचे','ने','पर','पर','पहले','पूरा','पे','फिर','बनी','बही','बहुत','बाद','बाला','बिलकुल','भी','भीतर','मगर','मानो','मे','में','यदि','यह','यहाँ','यही','या','यिह','ये','रखें','रहा','रहे','ऱ्वासा','लिए','लिये','लेकिन','व','वर्ग','वह','वह','वहाँ','वहीं','वाले','वुह','वे','वग़ैरह','संग','सकता','सकते','सबसे','सभी','साथ','साबुत','साभ','सारा','से','सो','ही','हुआ','हुई','हुए','है','हैं','हो','होता','होती','होते','होना','होने','अपनि','जेसे','होति','सभि','तिंहों','इंहों','दवारा','इसि','किंहें','थि','उंहों','ओर','जिंहें','वहिं','अभि','बनि','हि','उंहिं','उंहें','हें','वगेरह','एसे','रवासा','कोन','निचे','काफि','उसि','पुरा','भितर','हे','बहि','वहां','कोइ','यहां','जिंहों','तिंहें','किसि','कइ','यहि','इंहिं','जिधर','इंहें','अदि','इतयादि','हुइ','कोनसा','इसकि','दुसरे','जहां','अप','किंहों','उनकि','भि','वरग','हुअ','जेसा','नहिं',"-मे "," मे ","मे.","-मे,"," मे,","-मै "," मै ","मै.","मै,","-मेस "," मेस ","-मेस."," मेस.","मेसर्स","एम/एस"]
	`50`	+stopwords_en= ['i','me','my','myself','we','our','ours','ourselves','you',"you're","you've","you'll","you'd",'your','yours','yourself','yourselves','he','him','his','himself','she',"she's",'her','hers','herself','it',"it's",'its','itself','they','them','their','theirs','themselves','what','which','who','whom','this','that',"that'll",'these','those','am','is','are','was','were','be','been','being','have','has','had','having','do','does','did','doing','a','an','the','and','but','if','or','because','as','until','while','of','at','by','for','with','about','against','between','into','through','during','before','after','above','below','to','from','up','down','in','out','on','off','over','under','again','further','then','once','here','there','when','where','why','how','all','any','both','each','few','more','most','other','some','such','no','nor','not','only','own','same','so','than','too','very','s','t','can','will','just','don',"don't",'should',"should've",'now','d','ll','m','o','re','ve','y','ain','aren',"aren't",'couldn',"couldn't",'didn',"didn't",'doesn',"doesn't",'hadn',"hadn't",'hasn',"hasn't",'haven',"haven't",'isn',"isn't",'ma','mightn',"mightn't",'mustn',"mustn't",'needn',"needn't",'shan',"shan't",'shouldn',"shouldn't",'wasn',"wasn't",'weren',"weren't",'won',"won't",'wouldn',"wouldn't","-ms "," ms ","m/s ","m/s."]
	`51`	+punctuations= ['nn','n','।','/','`','+','\', ','?','▁(','$','@','[','_',"'",'!',',',':','^','\|',']','=','%','&','.',')','(','*','',';','-','{','}','\|','"']
	`52`	`+trial=['एकूण',"क्षेत्रफि","क्षेत्रफळ","क्षेत्र","क्षेञ"]`
	`53`	`+to_be_removed=stopwords_hi+punctuations+stopwords_en+trial`
	`54`	`+`
	`55`	`+forjinrange(len(df)):`
	`56`	`+df[i][j]=[eleforeleindf[i][j]ifelenotin (to_be_removed)]`
	`57`	`+`
	`58`	`+except:`
	`59`	`+df[i]=df[i].astype(str)`
	`60`	`+df[i]=df[i].apply(lambdax:tokenization(x))`
	`61`	+stopwords_hi= ['तुम','मेरी','मुझे','क्योंकि','हम','प्रति','अबकी','आगे','माननीय','शहर','बताएं','कौनसी','क्लिक','किसकी','बड़े','मैं','and','रही','आज','लें','आपके','मिलकर','सब','मेरे','जी','श्री','वैसा','आपका','अंदर','अत','अपना','अपनी','अपने','अभी','आदि','आप','इत्यादि','इन','इनका','इन्हीं','इन्हें','इन्हों','इस','इसका','इसकी','इसके','इसमें','इसी','इसे','उन','उनका','उनकी','उनके','उनको','उन्हीं','उन्हें','उन्हों','उस','उसके','उसी','उसे','एक','एवं','एस','ऐसे','और','कई','कर','करता','करते','करना','करने','करें','कहते','कहा','का','काफ़ी','कि','कितना','किन्हें','किन्हों','किया','किर','किस','किसी','किसे','की','कुछ','कुल','के','को','कोई','कौन','कौनसा','गया','घर','जब','जहाँ','जा','जितना','जिन','जिन्हें','जिन्हों','जिस','जिसे','जीधर','जैसा','जैसे','जो','तक','तब','तरह','तिन','तिन्हें','तिन्हों','तिस','तिसे','तो','था','थी','थे','दबारा','दिया','दुसरा','दूसरे','दो','द्वारा','न','नहीं','ना','निहायत','नीचे','ने','पर','पर','पहले','पूरा','पे','फिर','बनी','बही','बहुत','बाद','बाला','बिलकुल','भी','भीतर','मगर','मानो','मे','में','यदि','यह','यहाँ','यही','या','यिह','ये','रखें','रहा','रहे','ऱ्वासा','लिए','लिये','लेकिन','व','वर्ग','वह','वह','वहाँ','वहीं','वाले','वुह','वे','वग़ैरह','संग','सकता','सकते','सबसे','सभी','साथ','साबुत','साभ','सारा','से','सो','ही','हुआ','हुई','हुए','है','हैं','हो','होता','होती','होते','होना','होने','अपनि','जेसे','होति','सभि','तिंहों','इंहों','दवारा','इसि','किंहें','थि','उंहों','ओर','जिंहें','वहिं','अभि','बनि','हि','उंहिं','उंहें','हें','वगेरह','एसे','रवासा','कोन','निचे','काफि','उसि','पुरा','भितर','हे','बहि','वहां','कोइ','यहां','जिंहों','तिंहें','किसि','कइ','यहि','इंहिं','जिधर','इंहें','अदि','इतयादि','हुइ','कोनसा','इसकि','दुसरे','जहां','अप','किंहों','उनकि','भि','वरग','हुअ','जेसा','नहिं',"-मे "," मे ","मे.","-मे,"," मे,","-मै "," मै ","मै.","मै,","-मेस "," मेस ","-मेस."," मेस.","मेसर्स","एम/एस"]
	`62`	+stopwords_en= ['i','me','my','myself','we','our','ours','ourselves','you',"you're","you've","you'll","you'd",'your','yours','yourself','yourselves','he','him','his','himself','she',"she's",'her','hers','herself','it',"it's",'its','itself','they','them','their','theirs','themselves','what','which','who','whom','this','that',"that'll",'these','those','am','is','are','was','were','be','been','being','have','has','had','having','do','does','did','doing','a','an','the','and','but','if','or','because','as','until','while','of','at','by','for','with','about','against','between','into','through','during','before','after','above','below','to','from','up','down','in','out','on','off','over','under','again','further','then','once','here','there','when','where','why','how','all','any','both','each','few','more','most','other','some','such','no','nor','not','only','own','same','so','than','too','very','s','t','can','will','just','don',"don't",'should',"should've",'now','d','ll','m','o','re','ve','y','ain','aren',"aren't",'couldn',"couldn't",'didn',"didn't",'doesn',"doesn't",'hadn',"hadn't",'hasn',"hasn't",'haven',"haven't",'isn',"isn't",'ma','mightn',"mightn't",'mustn',"mustn't",'needn',"needn't",'shan',"shan't",'shouldn',"shouldn't",'wasn',"wasn't",'weren',"weren't",'won',"won't",'wouldn',"wouldn't","-ms "," ms ","m/s ","m/s."]
	`63`	+punctuations= ['nn','n','।','/','`','+','\', ','?','▁(','$','@','[','_',"'",'!',',',':','^','\|',']','=','%','&','.',')','(','*','',';','-','{','}','\|','"']
	`64`	`+trial=['एकूण',"क्षेत्रफि","क्षेत्रफळ","क्षेत्र","क्षेञ"]`
	`65`	`+to_be_removed=stopwords_hi+punctuations+stopwords_en+trial`
	`66`	`+`
	`67`	`+forjinrange(len(df)):`
	`68`	`+df[i][j]=[eleforeleindf[i][j]ifelenotin (to_be_removed)]`
	`69`	`+df.head(20)`
	`70`	`+`
	`71`	`+#pip install English-to-Hindi`
	`72`	`+#%%`
	`73`	`+fromgoogletransimportTranslator`
	`74`	`+str="सलील जनार्दन बोरवंडकर"`
	`75`	`+translator=Translator()`
	`76`	`+translated=translator.translate(str,src='hi',dest='en')`
	`77`	`+print(translated.text)`
	`78`	`+# %%`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 58790b0

File tree

1 file changed

1 file changed

`‎HindiDataCleaning.py‎`

0 commit comments