Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 13c31a7

Browse files
committed
Added Image Scraper Notebooks
1 parent 14de319 commit 13c31a7

File tree

3 files changed

+520
-0
lines changed

3 files changed

+520
-0
lines changed

‎Image Scraper (farfetch).ipynb‎

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type":"code",
5+
"execution_count":2,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"import urllib\n",
10+
"import urllib.request\n",
11+
"from bs4 import BeautifulSoup"
12+
]
13+
},
14+
{
15+
"cell_type":"code",
16+
"execution_count":3,
17+
"metadata": {},
18+
"outputs": [],
19+
"source": [
20+
# Pretend to be a desktop browser: some shops block urllib's default UA string.
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'

headers = {'User-Agent': user_agent}

def make_soup(url, timeout=30):
    """Download *url* with a browser-like User-Agent and parse it.

    Parameters
    ----------
    url : str
        Page to fetch.
    timeout : float, optional
        Socket timeout in seconds (new, backward-compatible default);
        prevents a stalled server from hanging the scraper forever.

    Returns
    -------
    bs4.BeautifulSoup
        Document parsed with the stdlib ``html.parser`` backend.
    """
    request = urllib.request.Request(url, None, headers)  # the assembled request
    # Context manager closes the socket even if BeautifulSoup raises mid-parse;
    # the original left the response open (resource leak).
    with urllib.request.urlopen(request, timeout=timeout) as thepage:
        return BeautifulSoup(thepage, "html.parser")
29+
]
30+
},
31+
{
32+
"cell_type":"code",
33+
"execution_count":11,
34+
"metadata": {},
35+
"outputs": [
36+
{
37+
"data": {
38+
"text/plain": [
39+
"50"
40+
]
41+
},
42+
"execution_count":11,
43+
"metadata": {},
44+
"output_type":"execute_result"
45+
}
46+
],
47+
"source": [
48+
# Collect thumbnail URLs from one Farfetch listing page.
# Product thumbnails end in "255.jpg"; keep at most the first 50 matches.
urls = []
site = 'https://www.farfetch.com/in/shopping/women/tops-1/items.aspx?page=2&view=90'
soup = make_soup(site)
for img in soup.findAll('img'):
    src = img.get('src')  # may be None for lazy-loaded images
    if src is not None and "255.jpg" in src:
        urls.append(src)
        # Direct length guard replaces the original's error-prone manual
        # counter (`it` starting at 1 with an `> 50` check); same 50-item cap.
        if len(urls) >= 50:
            break
len(urls)
63+
]
64+
},
65+
{
66+
"cell_type":"code",
67+
"execution_count":12,
68+
"metadata": {},
69+
"outputs": [],
70+
"source": [
71+
# Save every collected image under 'Far test/Jackets/' as <n>_Tops.jpeg.
# NOTE(review): the directory is assumed to exist already — confirm, or
# create it up front; open() raises FileNotFoundError otherwise.
for index, image_url in enumerate(urls, start=1):
    path = 'Far test/Jackets/' + str(index) + '_Tops' + ".jpeg"
    # Both the HTTP response and the output file are context-managed so
    # neither handle leaks if the download raises (the original closed the
    # file only on success and never closed the response).
    with urllib.request.urlopen(image_url) as response:
        with open(path, 'wb') as imagefile:
            imagefile.write(response.read())
77+
]
78+
},
79+
{
80+
"cell_type":"markdown",
81+
"metadata": {},
82+
"source": [
83+
"#"
84+
]
85+
}
86+
],
87+
"metadata": {
88+
"kernelspec": {
89+
"display_name":"Python 3",
90+
"language":"python",
91+
"name":"python3"
92+
},
93+
"language_info": {
94+
"codemirror_mode": {
95+
"name":"ipython",
96+
"version":3
97+
},
98+
"file_extension":".py",
99+
"mimetype":"text/x-python",
100+
"name":"python",
101+
"nbconvert_exporter":"python",
102+
"pygments_lexer":"ipython3",
103+
"version":"3.6.5"
104+
}
105+
},
106+
"nbformat":4,
107+
"nbformat_minor":2
108+
}

‎Image Scraper( ASOS ).ipynb‎

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type":"code",
5+
"execution_count":1,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"import urllib\n",
10+
"import urllib.request\n",
11+
"from bs4 import BeautifulSoup"
12+
]
13+
},
14+
{
15+
"cell_type":"code",
16+
"execution_count":3,
17+
"metadata": {},
18+
"outputs": [],
19+
"source": [
20+
# Pretend to be a desktop browser: some shops block urllib's default UA string.
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'

headers = {'User-Agent': user_agent}

def make_soup(url, timeout=30):
    """Download *url* with a browser-like User-Agent and parse it.

    Parameters
    ----------
    url : str
        Page to fetch.
    timeout : float, optional
        Socket timeout in seconds (new, backward-compatible default);
        prevents a stalled server from hanging the scraper forever.

    Returns
    -------
    bs4.BeautifulSoup
        Document parsed with the stdlib ``html.parser`` backend.
    """
    request = urllib.request.Request(url, None, headers)  # the assembled request
    # Context manager closes the socket even if BeautifulSoup raises mid-parse;
    # the original left the response open (resource leak).
    with urllib.request.urlopen(request, timeout=timeout) as thepage:
        return BeautifulSoup(thepage, "html.parser")
29+
]
30+
},
31+
{
32+
"cell_type":"code",
33+
"execution_count":29,
34+
"metadata": {},
35+
"outputs": [
36+
{
37+
"name":"stdout",
38+
"output_type":"stream",
39+
"text": [
40+
"Page 0 done\n",
41+
"Page 1 done\n",
42+
"Page 2 done\n",
43+
"Page 3 done\n",
44+
"Page 4 done\n",
45+
"Page 5 done\n",
46+
"Page 6 done\n",
47+
"Page 7 done\n",
48+
"Page 8 done\n",
49+
"Page 9 done\n",
50+
"Page 10 done\n",
51+
"Page 11 done\n",
52+
"Page 12 done\n",
53+
"Page 13 done\n",
54+
"Page 14 done\n",
55+
"Page 15 done\n",
56+
"Page 16 done\n",
57+
"Page 17 done\n",
58+
"Page 18 done\n",
59+
"Page 19 done\n",
60+
"Page 20 done\n",
61+
"Page 21 done\n",
62+
"Page 22 done\n",
63+
"Page 23 done\n",
64+
"Page 24 done\n",
65+
"Page 25 done\n",
66+
"Page 26 done\n",
67+
"Page 27 done\n",
68+
"Page 28 done\n",
69+
"Page 29 done\n",
70+
"Page 30 done\n",
71+
"Page 31 done\n",
72+
"Page 32 done\n",
73+
"Page 33 done\n",
74+
"Page 34 done\n",
75+
"Page 35 done\n",
76+
"Page 36 done\n",
77+
"Page 37 done\n",
78+
"Page 38 done\n",
79+
"Page 39 done\n",
80+
"Page 40 done\n",
81+
"Page 41 done\n",
82+
"Page 42 done\n",
83+
"Page 43 done\n",
84+
"Page 44 done\n",
85+
"Page 45 done\n",
86+
"Page 46 done\n",
87+
"Page 47 done\n",
88+
"Page 48 done\n",
89+
"Page 49 done\n",
90+
"Page 50 done\n",
91+
"Page 51 done\n",
92+
"Page 52 done\n",
93+
"Page 53 done\n",
94+
"Page 54 done\n",
95+
"Page 55 done\n",
96+
"Page 56 done\n",
97+
"Page 57 done\n",
98+
"Page 58 done\n",
99+
"Page 59 done\n",
100+
"Page 60 done\n",
101+
"Page 61 done\n",
102+
"Page 62 done\n",
103+
"Page 63 done\n",
104+
"Page 64 done\n",
105+
"Page 65 done\n",
106+
"Page 66 done\n",
107+
"Page 67 done\n",
108+
"Page 68 done\n",
109+
"Page 69 done\n"
110+
]
111+
}
112+
],
113+
"source": [
114+
# Walk 70 ASOS listing pages and collect every product-image URL.
urls = []
# Loop-invariant base URL hoisted out of the loop (the original rebuilt the
# literal every iteration); page number is appended per request.
base = 'http://www.asos.com/women/tops/cat/?cid=4169&nlid=ww|clothing|shop%20by%20product&page='
for page in range(70):
    soup = make_soup(base + str(page))
    for img in soup.findAll('img'):
        src = img.get('src')  # may be None for lazy-loaded images
        # ASOS serves product shots from URLs containing "products".
        if src is not None and "products" in src:
            urls.append(src)
    print('Page ', page, ' done')
124+
]
125+
},
126+
{
127+
"cell_type":"code",
128+
"execution_count":30,
129+
"metadata": {},
130+
"outputs": [
131+
{
132+
"data": {
133+
"text/plain": [
134+
"2520"
135+
]
136+
},
137+
"execution_count":30,
138+
"metadata": {},
139+
"output_type":"execute_result"
140+
}
141+
],
142+
"source": [
143+
"len(urls)"
144+
]
145+
},
146+
{
147+
"cell_type":"code",
148+
"execution_count":31,
149+
"metadata": {},
150+
"outputs": [],
151+
"source": [
152+
"\n",
153+
# Save every collected image under 'ASOS/Tops/' as <n>_Tops.jpeg.
# NOTE(review): the directory is assumed to exist already — confirm, or
# create it up front; open() raises FileNotFoundError otherwise.
for index, image_url in enumerate(urls, start=1):
    path = 'ASOS/Tops/' + str(index) + '_Tops' + ".jpeg"
    # Both the HTTP response and the output file are context-managed so
    # neither handle leaks if the download raises (the original closed the
    # file only on success and never closed the response).
    with urllib.request.urlopen(image_url) as response:
        with open(path, 'wb') as imagefile:
            imagefile.write(response.read())
159+
]
160+
},
161+
{
162+
"cell_type":"code",
163+
"execution_count":null,
164+
"metadata": {},
165+
"outputs": [],
166+
"source": []
167+
}
168+
],
169+
"metadata": {
170+
"kernelspec": {
171+
"display_name":"Python 3",
172+
"language":"python",
173+
"name":"python3"
174+
},
175+
"language_info": {
176+
"codemirror_mode": {
177+
"name":"ipython",
178+
"version":3
179+
},
180+
"file_extension":".py",
181+
"mimetype":"text/x-python",
182+
"name":"python",
183+
"nbconvert_exporter":"python",
184+
"pygments_lexer":"ipython3",
185+
"version":"3.6.5"
186+
}
187+
},
188+
"nbformat":4,
189+
"nbformat_minor":2
190+
}

0 commit comments

Comments (0)

[8]ページ先頭

©2009-2025 Movatter.jp