Browse Source

[gemini] quick fix on possible async operation (#5803)

* [gemini] quick fix on possible async operation

* [gemini] quick fix on possible async operation
pull/5812/head
botbw 5 months ago committed by GitHub
parent
commit
3bcbba9262
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 9
      colossalai/zero/gemini/gemini_hook.py

9
colossalai/zero/gemini/gemini_hook.py

@ -55,6 +55,15 @@ class GeminiZeROHook(ColoParamOpHook):
)
# prefetch
if self._gemini_manager.chunk_manager._prefetch_stream is not None:
# When prefetch happens for the first time, there is no dist.Work to sync with,
# so it is possible that the optimizer hasn't finished its computation on the default stream,
# and thus we might prefetch outdated chunks here.
#
# In all other cases, self._gemini_manager.wait_chunks will already have synced with the default stream
# by calling dist.Work.wait(), so this line makes no difference.
self._gemini_manager.chunk_manager._prefetch_stream.wait_stream(torch.cuda.current_stream())
with get_accelerator().stream(self._gemini_manager.chunk_manager._prefetch_stream):
for chunk in chunks_fetch_async:
maybe_work = self._chunk_manager.access_chunk(chunk, async_access=True)

Loading…
Cancel
Save