{"payload":{"feedbackUrl":"https://github.com/orgs/community/discussions/53140","repo":{"id":599547518,"defaultBranch":"main","name":"vllm","ownerLogin":"vllm-project","currentUserCanPush":false,"isFork":false,"isEmpty":false,"createdAt":"2023-02-09T11:23:20.000Z","ownerAvatar":"https://avatars.githubusercontent.com/u/136984999?v=4","public":true,"private":false,"isOrgOwned":true},"refInfo":{"name":"","listCacheKey":"v0:1717822267.0","currentOid":""},"activityList":{"items":[{"before":"8ea5e44a435e8731fd6f5ba4c329dd112752532a","after":"c09dade2a263b6f684d2fbf390c9c1c64761e953","ref":"refs/heads/main","pushedAt":"2024-06-08T17:54:05.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"mgoin","name":"Michael Goin","path":"/mgoin","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/3195154?s=80&v=4"},"commit":{"message":"[Misc][Breaking] Change FP8 checkpoint format from act_scale -> input_scale (#5353)","shortMessageHtmlLink":"[Misc][Breaking] Change FP8 checkpoint format from act_scale -> input…"}},{"before":"9fb900f90cbb5614c3e7d67446325ad8b7ac04b2","after":"8ea5e44a435e8731fd6f5ba4c329dd112752532a","ref":"refs/heads/main","pushedAt":"2024-06-08T08:59:21.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"youkaichao","name":"youkaichao","path":"/youkaichao","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/23236638?s=80&v=4"},"commit":{"message":"[CI/Test] improve robustness of test (vllm_runner) (#5357)\n\n[CI/Test] improve robustness of test by replacing del with context manager (vllm_runner) (#5357)","shortMessageHtmlLink":"[CI/Test] improve robustness of test (vllm_runner) (#5357)"}},{"before":"c96fc067479453b02e92d9378eeeaebb6b3816de","after":"9fb900f90cbb5614c3e7d67446325ad8b7ac04b2","ref":"refs/heads/main","pushedAt":"2024-06-08T05:31:32.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"youkaichao","name":"youkaichao","path":"/youkaichao","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/23236638?s=80&v=4"},"commit":{"message":"[CI/Test] improve robustness of test (hf_runner) (#5347)\n\n[CI/Test] improve robustness of test by replacing del with context manager (hf_runner) (#5347)","shortMessageHtmlLink":"[CI/Test] improve robustness of test (hf_runner) (#5347)"}},{"before":"7c52473c068a7b8ac325ddbed15e95962159e50e","after":null,"ref":"refs/heads/fix-logits","pushedAt":"2024-06-08T04:51:07.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"WoosukKwon","name":"Woosuk Kwon","path":"/WoosukKwon","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/46394894?s=80&v=4"}},{"before":"c149a9819c2f83c2b8850c79d54e014d4f27e836","after":null,"ref":"refs/heads/test-acc","pushedAt":"2024-06-08T04:43:21.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"WoosukKwon","name":"Woosuk Kwon","path":"/WoosukKwon","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/46394894?s=80&v=4"}},{"before":"97a99491116d7510504c442b03509fa13a08e244","after":null,"ref":"refs/heads/compilable-rope","pushedAt":"2024-06-08T04:43:13.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"WoosukKwon","name":"Woosuk Kwon","path":"/WoosukKwon","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/46394894?s=80&v=4"}},{"before":"6516f6df111d22ee9215c4fa97b836851ba9e2f8","after":null,"ref":"refs/heads/integrate-flashinfer","pushedAt":"2024-06-08T04:43:08.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"WoosukKwon","name":"Woosuk Kwon","path":"/WoosukKwon","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/46394894?s=80&v=4"}},{"before":"b3376e5c76c199acb216addec7c32ac5299bef31","after":"c96fc067479453b02e92d9378eeeaebb6b3816de","ref":"refs/heads/main","pushedAt":"2024-06-08T02:13:12.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"WoosukKwon","name":"Woosuk Kwon","path":"/WoosukKwon","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/46394894?s=80&v=4"},"commit":{"message":"[ROCm][AMD] Use pytorch sdpa math backend to do naive attention (#4965)","shortMessageHtmlLink":"[ROCm][AMD] Use pytorch sdpa math backend to do naive attention (#4965)"}},{"before":"e69ded7d1c8a4f6ed26e64090bdc050c06cde3b9","after":"b3376e5c76c199acb216addec7c32ac5299bef31","ref":"refs/heads/main","pushedAt":"2024-06-08T01:20:16.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"DarkLight1337","name":"Cyrus Leung","path":"/DarkLight1337","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/44970335?s=80&v=4"},"commit":{"message":"[Misc] Add args for selecting distributed executor to benchmarks (#5335)","shortMessageHtmlLink":"[Misc] Add args for selecting distributed executor to benchmarks (#5335)"}},{"before":"767c727a81ae9ec570d30d55b7afc783775d5a05","after":"e69ded7d1c8a4f6ed26e64090bdc050c06cde3b9","ref":"refs/heads/main","pushedAt":"2024-06-08T00:42:05.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"pcmoritz","name":"Philipp Moritz","path":"/pcmoritz","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/113316?s=80&v=4"},"commit":{"message":"[Bug Fix] Fix the support check for FP8 CUTLASS (#5352)\n\nBug description:\r\nWith torch 2.4.0.dev20240603+cu121,\r\ncutlass_fp8_supported outputs False, and the (capability, version) before the comparison is (90, 11111111112)\r\n\r\nThis PR fixes the support check for FP8 CUTLASS ( cutlass_fp8_supported) which was introduced in https://github.com/vllm-project/vllm/pull/5183.","shortMessageHtmlLink":"[Bug Fix] Fix the support check for FP8 CUTLASS (#5352)"}},{"before":"6840a716104c8c17303b938673c2ac019e541700","after":"767c727a81ae9ec570d30d55b7afc783775d5a05","ref":"refs/heads/main","pushedAt":"2024-06-07T21:10:22.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"WoosukKwon","name":"Woosuk Kwon","path":"/WoosukKwon","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/46394894?s=80&v=4"},"commit":{"message":"fix DbrxFusedNormAttention missing cache_config (#5340)\n\nCo-authored-by: team ","shortMessageHtmlLink":"fix DbrxFusedNormAttention missing cache_config (#5340)"}},{"before":"7a9cb294ae317b28a60165b34c8398c762869a74","after":"6840a716104c8c17303b938673c2ac019e541700","ref":"refs/heads/main","pushedAt":"2024-06-07T21:09:13.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"WoosukKwon","name":"Woosuk Kwon","path":"/WoosukKwon","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/46394894?s=80&v=4"},"commit":{"message":"[Misc] Remove unused cuda_utils.h in CPU backend (#5345)","shortMessageHtmlLink":"[Misc] Remove unused cuda_utils.h in CPU backend (#5345)"}},{"before":"ca3ea51bde6c22d0afb3aa0a3fdba6d568095a0a","after":"7a9cb294ae317b28a60165b34c8398c762869a74","ref":"refs/heads/main","pushedAt":"2024-06-07T18:23:32.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"ywang96","name":"Roger Wang","path":"/ywang96","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/136131678?s=80&v=4"},"commit":{"message":"[Frontend] Add OpenAI Vision API Support (#5237)\n\nCo-authored-by: DarkLight1337 ","shortMessageHtmlLink":"[Frontend] Add OpenAI Vision API Support (#5237)"}},{"before":"dc49fb892ca32cb364dfc39d711ab84d3b35a28f","after":"ca3ea51bde6c22d0afb3aa0a3fdba6d568095a0a","ref":"refs/heads/main","pushedAt":"2024-06-07T16:36:26.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"WoosukKwon","name":"Woosuk Kwon","path":"/WoosukKwon","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/46394894?s=80&v=4"},"commit":{"message":"[Kernel] Dynamic Per-Token Activation Quantization (#5037)\n\nCo-authored-by: Varun Sundar Rabindranath \r\nCo-authored-by: Varun Sundar Rabindranath ","shortMessageHtmlLink":"[Kernel] Dynamic Per-Token Activation Quantization (#5037)"}},{"before":"18a277b52dd2a64ee4c0111fc8cda126031e5889","after":"dc49fb892ca32cb364dfc39d711ab84d3b35a28f","ref":"refs/heads/main","pushedAt":"2024-06-07T13:35:42.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"DarkLight1337","name":"Cyrus Leung","path":"/DarkLight1337","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/44970335?s=80&v=4"},"commit":{"message":"Addition of lacked ignored_seq_groups in _schedule_chunked_prefill (#5296)","shortMessageHtmlLink":"Addition of lacked ignored_seq_groups in _schedule_chunked_prefill (#…"}},{"before":"8d75fe48ca5f46b7af0f5201d8500b9604eed769","after":"18a277b52dd2a64ee4c0111fc8cda126031e5889","ref":"refs/heads/main","pushedAt":"2024-06-07T10:01:56.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"Yard1","name":"Antoni Baum","path":"/Yard1","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/10364161?s=80&v=4"},"commit":{"message":"Remove Ray health check (#4693)","shortMessageHtmlLink":"Remove Ray health check (#4693)"}},{"before":"388596c91437a51d428a447594e9faec340c29b2","after":"8d75fe48ca5f46b7af0f5201d8500b9604eed769","ref":"refs/heads/main","pushedAt":"2024-06-07T08:42:35.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"pcmoritz","name":"Philipp Moritz","path":"/pcmoritz","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/113316?s=80&v=4"},"commit":{"message":"[Kernel] Switch fp8 layers to use the CUTLASS kernels (#5183)\n\nSwitching from torch._scaled_mm to vLLM's cutlass fp8 kernels when supported as we are seeing 5-15% improvement in e2e performance on neuralmagic/Meta-Llama-3-8B-Instruct-FP8\r\n\r\nsee https://docs.google.com/spreadsheets/d/1GiAnmzyGHgZ6zL_LDSTm35Bdrt4A8AaFEurDlISYYA4/ for some quick e2e benchmarks and #5144 for comparisons across different GEMM sizes.","shortMessageHtmlLink":"[Kernel] Switch fp8 layers to use the CUTLASS kernels (#5183)"}},{"before":"baa15a9ec320a6b90222df0aaed13b89e3bafc9c","after":"388596c91437a51d428a447594e9faec340c29b2","ref":"refs/heads/main","pushedAt":"2024-06-07T05:15:12.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"WoosukKwon","name":"Woosuk Kwon","path":"/WoosukKwon","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/46394894?s=80&v=4"},"commit":{"message":"[Misc][Utils] allow get_open_port to be called for multiple times (#5333)","shortMessageHtmlLink":"[Misc][Utils] allow get_open_port to be called for multiple times (#5333"}},{"before":"15063741e30881d7a982c3436c3299a0551327dc","after":"baa15a9ec320a6b90222df0aaed13b89e3bafc9c","ref":"refs/heads/main","pushedAt":"2024-06-07T03:29:24.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"DarkLight1337","name":"Cyrus Leung","path":"/DarkLight1337","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/44970335?s=80&v=4"},"commit":{"message":"[Feature][Frontend]: Add support for `stream_options` in `ChatCompletionRequest` (#5135)","shortMessageHtmlLink":"[Feature][Frontend]: Add support for stream_options in `ChatComplet…"}},{"before":"ccdc490dda3f534c63c1faf29a638e65594d0dc3","after":"15063741e30881d7a982c3436c3299a0551327dc","ref":"refs/heads/main","pushedAt":"2024-06-07T03:17:21.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"WoosukKwon","name":"Woosuk Kwon","path":"/WoosukKwon","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/46394894?s=80&v=4"},"commit":{"message":"[Misc] Missing error message for custom ops import (#5282)","shortMessageHtmlLink":"[Misc] Missing error message for custom ops import (#5282)"}},{"before":"a31cab7556f540b558b0b454b4a4b9b438542566","after":"ccdc490dda3f534c63c1faf29a638e65594d0dc3","ref":"refs/heads/main","pushedAt":"2024-06-07T02:07:57.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"WoosukKwon","name":"Woosuk Kwon","path":"/WoosukKwon","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/46394894?s=80&v=4"},"commit":{"message":"[Core] Change LoRA embedding sharding to support loading methods (#5038)","shortMessageHtmlLink":"[Core] Change LoRA embedding sharding to support loading methods (#5038)"}},{"before":"3eb30a967eded5023546469cdb73d3e6a5b50ce7","after":null,"ref":"refs/heads/avoid_tokens_copy","pushedAt":"2024-06-07T02:06:14.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"WoosukKwon","name":"Woosuk Kwon","path":"/WoosukKwon","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/46394894?s=80&v=4"}},{"before":"828da0d44e9124d949909477d6018fc08469a31e","after":"a31cab7556f540b558b0b454b4a4b9b438542566","ref":"refs/heads/main","pushedAt":"2024-06-07T01:12:00.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"zhuohan123","name":"Zhuohan Li","path":"/zhuohan123","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/17310766?s=80&v=4"},"commit":{"message":"[Core] Avoid copying prompt/output tokens if no penalties are used (#5289)","shortMessageHtmlLink":"[Core] Avoid copying prompt/output tokens if no penalties are used (#…"}},{"before":"e881c1cf9a47d8a2a75bdd7caa79692b9abec723","after":"f0d3ac9afdc0dfcba97703166e63f542fe51322e","ref":"refs/heads/torch-xla","pushedAt":"2024-06-06T22:35:12.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"WoosukKwon","name":"Woosuk Kwon","path":"/WoosukKwon","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/46394894?s=80&v=4"},"commit":{"message":"Disable top-p sampling","shortMessageHtmlLink":"Disable top-p sampling"}},{"before":"a0701100da28ff50eb5f09e9d29cf5191e520828","after":"3eb30a967eded5023546469cdb73d3e6a5b50ce7","ref":"refs/heads/avoid_tokens_copy","pushedAt":"2024-06-06T20:51:38.000Z","pushType":"push","commitsCount":13,"pusher":{"login":"Yard1","name":"Antoni Baum","path":"/Yard1","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/10364161?s=80&v=4"},"commit":{"message":"Merge branch 'main' into avoid_tokens_copy","shortMessageHtmlLink":"Merge branch 'main' into avoid_tokens_copy"}},{"before":"abe855d63774c44e69048dfd188f0333db581d4b","after":"828da0d44e9124d949909477d6018fc08469a31e","ref":"refs/heads/main","pushedAt":"2024-06-06T20:48:14.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"simon-mo","name":"Simon Mo","path":"/simon-mo","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/21118851?s=80&v=4"},"commit":{"message":"[Frontend] enable passing multiple LoRA adapters at once to generate() (#5300)","shortMessageHtmlLink":"[Frontend] enable passing multiple LoRA adapters at once to generate() ("}},{"before":"ee01196502436b6984e78c8f40f2a12fea0caac8","after":"e881c1cf9a47d8a2a75bdd7caa79692b9abec723","ref":"refs/heads/torch-xla","pushedAt":"2024-06-06T18:20:21.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"WoosukKwon","name":"Woosuk Kwon","path":"/WoosukKwon","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/46394894?s=80&v=4"},"commit":{"message":"MInor","shortMessageHtmlLink":"MInor"}},{"before":"79d0c43116e42f4312c4f9b253f1f8f3f0508b33","after":"a0701100da28ff50eb5f09e9d29cf5191e520828","ref":"refs/heads/avoid_tokens_copy","pushedAt":"2024-06-06T18:19:55.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"Yard1","name":"Antoni Baum","path":"/Yard1","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/10364161?s=80&v=4"},"commit":{"message":"Lint","shortMessageHtmlLink":"Lint"}},{"before":"84e4c51e301ef91a322ed2711a6664eeb6239ae3","after":"ee01196502436b6984e78c8f40f2a12fea0caac8","ref":"refs/heads/torch-xla","pushedAt":"2024-06-06T18:15:44.000Z","pushType":"push","commitsCount":4,"pusher":{"login":"WoosukKwon","name":"Woosuk Kwon","path":"/WoosukKwon","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/46394894?s=80&v=4"},"commit":{"message":"Add top-p sampling","shortMessageHtmlLink":"Add top-p sampling"}},{"before":"4efff036f0dfeee21e82044e9b6e63b861b817a3","after":"abe855d63774c44e69048dfd188f0333db581d4b","ref":"refs/heads/main","pushedAt":"2024-06-06T16:29:29.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"WoosukKwon","name":"Woosuk Kwon","path":"/WoosukKwon","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/46394894?s=80&v=4"},"commit":{"message":"[Kernel] Retune Mixtral 8x22b configs for FP8 on H100 (#5294)","shortMessageHtmlLink":"[Kernel] Retune Mixtral 8x22b configs for FP8 on H100 (#5294)"}}],"hasNextPage":true,"hasPreviousPage":false,"activityType":"all","actor":null,"timePeriod":"all","sort":"DESC","perPage":30,"cursor":"djE6ks8AAAAEYAY3MwA","startCursor":null,"endCursor":null}},"title":"Activity · vllm-project/vllm"}