|
78 | 78 | "name":"stdout", |
79 | 79 | "output_type":"stream", |
80 | 80 | "text": [ |
81 | | -"huggingface_hub version: 0.34.4\n", |
82 | | -"tokenizers version: 0.21.4\n", |
83 | | -"torch version: 2.8.0\n" |
| 81 | +"huggingface_hub version: 0.35.0\n", |
| 82 | +"tokenizers version: 0.22.1\n", |
| 83 | +"torch version: 2.9.0+cu130\n" |
84 | 84 | ] |
85 | 85 | } |
86 | 86 | ], |
|
700 | 700 | { |
701 | 701 | "data": { |
702 | 702 | "text/plain": [ |
703 | | -"tensor([[[ 0.7500, 0.1060, 0.4844, ..., 0.9414, 0.3984, -0.2324],\n", |
704 | | -" [-0.3438, -0.0549, 0.8984, ..., -0.2402, 0.4570, 0.8242],\n", |
705 | | -" [-0.2676, -0.3281, 0.4121, ..., 0.8711, -0.9648, 0.9844]]],\n", |
| 703 | +"tensor([[[ 0.7500, 0.1011, 0.4863, ..., 0.9414, 0.3984, -0.2285],\n", |
| 704 | +" [-0.3398, -0.0564, 0.9023, ..., -0.2480, 0.4551, 0.8203],\n", |
| 705 | +" [-0.2695, -0.3242, 0.4121, ..., 0.8672, -0.9688, 0.9844]]],\n", |
706 | 706 | " dtype=torch.bfloat16, grad_fn=<UnsafeViewBackward0>)" |
707 | 707 | ] |
708 | 708 | }, |
|
806 | 806 | "metadata": { |
807 | 807 | "id":"31f12baf-f79b-499f-85c0-51328a6a20f5" |
808 | 808 | }, |
809 | | -"outputs": [], |
| 809 | +"outputs": [ |
| 810 | + { |
| 811 | +"name":"stderr", |
| 812 | +"output_type":"stream", |
| 813 | +"text": [ |
| 814 | +"/home/rasbt/jupyterlab/reasoning/.venv/lib/python3.12/site-packages/torch/cuda/__init__.py:283: UserWarning:\n", |
| 815 | +" Found GPU0 NVIDIA GB10 which is of cuda capability 12.1.\n", |
| 816 | +" Minimum and Maximum cuda capability supported by this version of PyTorch is\n", |
| 817 | +" (8.0) - (12.0)\n", |
| 818 | +"\n", |
| 819 | +" warnings.warn(\n" |
| 820 | + ] |
| 821 | + } |
| 822 | + ], |
810 | 823 | "source": [ |
811 | 824 | "if torch.cuda.is_available():\n", |
812 | 825 | " device = torch.device(\"cuda\")\n", |
|
1038 | 1051 | "outputId":"55b2f28c-142f-4698-9d23-d27456d3ed6d" |
1039 | 1052 | }, |
1040 | 1053 | "outputs": [ |
| 1054 | + { |
| 1055 | +"data": { |
| 1056 | +"application/vnd.jupyter.widget-view+json": { |
| 1057 | +"model_id":"3396c08eab3f4cf980023483b969a337", |
| 1058 | +"version_major":2, |
| 1059 | +"version_minor":0 |
| 1060 | + }, |
| 1061 | +"text/plain": [ |
| 1062 | +"model.safetensors: 0%| | 0.00/536M [00:00<?, ?B/s]" |
| 1063 | + ] |
| 1064 | + }, |
| 1065 | +"metadata": {}, |
| 1066 | +"output_type":"display_data" |
| 1067 | + }, |
1041 | 1068 | { |
1042 | 1069 | "name":"stdout", |
1043 | 1070 | "output_type":"stream", |
|
1131 | 1158 | "execution_count":22, |
1132 | 1159 | "id":"7b6df8bc-7308-468e-93ce-2d5529ea7866", |
1133 | 1160 | "metadata": {}, |
1134 | | -"outputs": [], |
| 1161 | +"outputs": [ |
| 1162 | + { |
| 1163 | +"data": { |
| 1164 | +"application/vnd.jupyter.widget-view+json": { |
| 1165 | +"model_id":"39b7b77c5c3448cdbd48fcde4e1b1a57", |
| 1166 | +"version_major":2, |
| 1167 | +"version_minor":0 |
| 1168 | + }, |
| 1169 | +"text/plain": [ |
| 1170 | +"tokenizer.json: 0%| | 0.00/33.4M [00:00<?, ?B/s]" |
| 1171 | + ] |
| 1172 | + }, |
| 1173 | +"metadata": {}, |
| 1174 | +"output_type":"display_data" |
| 1175 | + } |
| 1176 | + ], |
1135 | 1177 | "source": [ |
1136 | 1178 | "tokenizer_file_path = os.path.join(local_dir,\"tokenizer.json\")\n", |
1137 | 1179 | "if not os.path.exists(tokenizer_file_path):\n", |
|
1195 | 1237 | }, |
1196 | 1238 | { |
1197 | 1239 | "cell_type":"code", |
1198 | | -"execution_count":25, |
| 1240 | +"execution_count":27, |
1199 | 1241 | "id":"7b8401c6-e244-4cb7-9849-2ba71ce758d5", |
1200 | 1242 | "metadata": { |
1201 | 1243 | "id":"7b8401c6-e244-4cb7-9849-2ba71ce758d5" |
1202 | 1244 | }, |
1203 | 1245 | "outputs": [], |
1204 | 1246 | "source": [ |
1205 | | -"def generate_text_basic_stream(model, token_ids, max_new_tokens,\n", |
1206 | | -" eos_token_id=None):\n", |
1207 | | -"\n", |
| 1247 | +"def generate_text_basic_stream(model, token_ids, max_new_tokens, eos_token_id=None, context_size=None):\n", |
1208 | 1248 | " model.eval()\n", |
| 1249 | +"\n", |
1209 | 1250 | " with torch.no_grad():\n", |
| 1251 | +" cache = KVCache(n_layers=model.cfg[\"n_layers\"])\n", |
| 1252 | +" model.reset_kv_cache()\n", |
| 1253 | +"\n", |
| 1254 | +" # Prime the cache with the initial context\n", |
| 1255 | +" logits = model(token_ids, cache=cache)\n", |
| 1256 | +"\n", |
1210 | 1257 | " for _ in range(max_new_tokens):\n", |
1211 | | -" out = model(token_ids)[:, -1]\n", |
1212 | | -" next_token = torch.argmax(out, dim=-1, keepdim=True)\n", |
| 1258 | +" next_token = torch.argmax(logits[:, -1], dim=-1, keepdim=True)\n", |
1213 | 1259 | "\n", |
1214 | | -" if (eos_token_id is not None\n", |
1215 | | -" and torch.all(next_token == eos_token_id)):\n", |
| 1260 | +" if eos_token_id is not None and torch.all(next_token == eos_token_id):\n", |
1216 | 1261 | " break\n", |
1217 | 1262 | "\n", |
1218 | | -" yield next_token # New: Yield each token as it's generated\n", |
1219 | | -"\n", |
1220 | | -" token_ids = torch.cat([token_ids, next_token], dim=1)" |
| 1263 | +" yield next_token\n", |
| 1264 | +"\n", |
| 1265 | +" token_ids = torch.cat([token_ids, next_token], dim=1)\n", |
| 1266 | +"\n", |
| 1267 | +" # Feed only the new token to the model; cache handles history\n", |
| 1268 | +" logits = model(next_token, cache=cache)" |
1221 | 1269 | ] |
1222 | 1270 | }, |
1223 | 1271 | { |
1224 | 1272 | "cell_type":"code", |
1225 | | -"execution_count":26, |
| 1273 | +"execution_count":28, |
1226 | 1274 | "id":"56c9d0cf-25e9-4375-8d5c-368fa6911fdf", |
1227 | 1275 | "metadata": {}, |
1228 | 1276 | "outputs": [ |
1229 | 1277 | { |
1230 | 1278 | "name":"stdout", |
1231 | 1279 | "output_type":"stream", |
1232 | 1280 | "text": [ |
1233 | | -"Large language models (LLMs) are sophisticated artificial intelligence systems that can understand, generate, and manipulate human language. They are trained on massive amounts of text data to learn patterns and relationships within language, enabling them to perform a wide range of tasks, from writing articles and answering questions to translating languages and summarizing information.\n" |
| 1281 | +"Large language models (LLMs) are sophisticated artificial intelligence systems that can understand, generate, and manipulate human language. They are trained on massive amounts of text data to learn patterns and relationships within that data, enabling them to perform a wide range of tasks, from writing articles and answering questions to translating languages and summarizing information.\n", |
| 1282 | +"\n", |
| 1283 | +"\n", |
| 1284 | +"GPU memory used: 0.96 GB\n" |
1234 | 1285 | ] |
1235 | 1286 | } |
1236 | 1287 | ], |
1237 | 1288 | "source": [ |
1238 | 1289 | "input_token_ids_tensor = torch.tensor(input_token_ids, device=device).unsqueeze(0)\n", |
1239 | 1290 | "\n", |
| 1291 | +"\n", |
| 1292 | +"if torch.cuda.is_available():\n", |
| 1293 | +" torch.cuda.reset_peak_memory_stats()\n", |
| 1294 | +"\n", |
| 1295 | +"\n", |
1240 | 1296 | "for token in generate_text_basic_stream(\n", |
1241 | 1297 | " model=model,\n", |
1242 | 1298 | " token_ids=input_token_ids_tensor,\n", |
1243 | | -" max_new_tokens=150,\n", |
| 1299 | +" max_new_tokens=500,\n", |
1244 | 1300 | " eos_token_id=tokenizer.encode(\"<end_of_turn>\")[-1]\n", |
1245 | 1301 | "):\n", |
1246 | 1302 | " token_id = token.squeeze(0).tolist()\n", |
1247 | 1303 | " print(\n", |
1248 | 1304 | " tokenizer.decode(token_id),\n", |
1249 | 1305 | " end=\"\",\n", |
1250 | 1306 | " flush=True\n", |
1251 | | -" )" |
| 1307 | +" )\n", |
| 1308 | +"\n", |
| 1309 | +"if torch.cuda.is_available():\n", |
| 1310 | +" def gpu_gb(x):\n", |
| 1311 | +" return f\"{x / 1024 / 1024 / 1024:.2f} GB\"\n", |
| 1312 | +"\n", |
| 1313 | +" print(f\"\\n\\nGPU memory used: {gpu_gb(torch.cuda.max_memory_allocated())}\")" |
1252 | 1314 | ] |
1253 | 1315 | }, |
1254 | 1316 | { |
|
1269 | 1331 | "id":"e6edaaae-2de1-406c-8ffa-897cdfa3808c" |
1270 | 1332 | }, |
1271 | 1333 | "source": [ |
1272 | | -"- Check out the [README.md](./README.md), to use this model via the `llms_from_scratch` package\n", |
1273 | 1334 | "- For those interested in a comprehensive guide on building a large language model from scratch and gaining a deeper understanding of its mechanics, you might like my [Build a Large Language Model (From Scratch)](http://mng.bz/orYv)\n", |
1274 | 1335 | "\n", |
1275 | 1336 | "<a href=\"http://mng.bz/orYv\"><img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/cover-small.webp\" width=\"100px\"></a>" |
|
1297 | 1358 | "name":"python", |
1298 | 1359 | "nbconvert_exporter":"python", |
1299 | 1360 | "pygments_lexer":"ipython3", |
1300 | | -"version":"3.10.16" |
| 1361 | +"version":"3.12.3" |
1301 | 1362 | } |
1302 | 1363 | }, |
1303 | 1364 | "nbformat":4, |
|